/** * Import a PDF file into a vault note (plain text extracted via unpdf / PDF.js). */ import '../shims/promise-try.mjs'; import crypto from 'crypto'; import fs from 'fs'; import path from 'path'; import { extractText, getDocumentProxy } from 'unpdf'; import { writeNote } from '../write.mjs'; import { normalizeSlug } from '../vault.mjs'; /** * Stable id from file bytes (hex, 32 chars). * @param {Buffer} buf */ function sourceIdFromPdfBytes(buf) { return crypto.createHash('sha256').update(buf).digest('hex').slice(0, 32); } /** * @param {string} inputPath */ function titleFromPdfFilename(inputPath) { const base = path.basename(inputPath, path.extname(inputPath)); const cleaned = base.replace(/[-_]+/g, ' ').trim(); return cleaned || 'Imported PDF'; } /** * @param {string} text */ function normalizeExtractedText(text) { let t = String(text || '').replace(/\r\n/g, '\n'); t = t.replace(/\u00a0/g, ' '); t = t.replace(/\n{3,}/g, '\n\n').trim(); return t; } /** * @param {string} input - Path to a .pdf file * @param {{ * vaultPath: string, * outputBase: string, * project?: string | null, * tags: string[], * dryRun: boolean, * onProgress?: (p: { progress: number, total?: number, message?: string }) => void | Promise * }} ctx * @returns {Promise<{ imported: { path: string, source_id?: string }[], count: number }>} */ export async function importPdf(input, ctx) { const raw = typeof input === 'string' ? input.trim() : ''; if (!raw) throw new Error('PDF path is required'); const { vaultPath, outputBase, project, tags, dryRun, onProgress } = ctx; if (onProgress) await onProgress({ progress: 0, total: 1, message: 'Reading PDF…' }); const absInput = path.isAbsolute(raw) ? raw : path.resolve(process.cwd(), raw); if (!fs.existsSync(absInput)) { throw new Error(`Input not found: ${input}`); } if (!fs.statSync(absInput).isFile()) { throw new Error(`PDF import requires a single .pdf file (not a directory): ${input}`); } if (!absInput.toLowerCase().endsWith('.pdf')) { throw new Error(`PDF import requires a .pdf file; got: ${path.basename(absInput)}`); } const buf = fs.readFileSync(absInput); const source_id = sourceIdFromPdfBytes(buf); const short = source_id.slice(0, 12); const outputRel = path.join(outputBase, 'imports', 'pdf', `${short}.md`).replace(/\\/g, '/'); const pdf = await getDocumentProxy(new Uint8Array(buf)); const { totalPages, text } = await extractText(pdf, { mergePages: true }); const bodyText = normalizeExtractedText(text); if (!bodyText) { throw new Error('Could not extract text from this PDF (empty or image-only)'); } const now = new Date().toISOString().slice(0, 10); const baseName = path.basename(absInput); const title = titleFromPdfFilename(absInput); const body = bodyText + '\n\n---\n\n' + `_Imported from PDF:_ \`${baseName}\` · ${totalPages} page(s).\n`; const merged = { title, date: now, source: 'pdf-import', source_id, pdf_file: baseName, pdf_pages: totalPages, ...(project && { project: normalizeSlug(project) }), ...(tags.length && { tags }), }; if (typeof merged.tags === 'string') merged.tags = tags; else if (Array.isArray(merged.tags)) merged.tags = [...new Set([...merged.tags, ...tags])]; else merged.tags = tags; if (!dryRun) { writeNote(vaultPath, outputRel, { body, frontmatter: Object.fromEntries(Object.entries(merged).filter(([, v]) => v !== undefined && v !== null && v !== '')), }); } if (onProgress) await onProgress({ progress: 1, total: 1, message: 'Done' }); return { imported: [{ path: outputRel, source_id }], count: 1 }; }