pdf.mjs
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠ breaking
1 day ago
| 1 | /** |
| 2 | * Import a PDF file into a vault note (plain text extracted via unpdf / PDF.js). |
| 3 | */ |
| 4 | |
| 5 | import '../shims/promise-try.mjs'; |
| 6 | import crypto from 'crypto'; |
| 7 | import fs from 'fs'; |
| 8 | import path from 'path'; |
| 9 | import { extractText, getDocumentProxy } from 'unpdf'; |
| 10 | import { writeNote } from '../write.mjs'; |
| 11 | import { normalizeSlug } from '../vault.mjs'; |
| 12 | |
/**
 * Derive a stable identifier for a PDF from its raw bytes.
 *
 * The id is the first 32 hex characters of the SHA-256 digest of `buf`, so
 * identical bytes always produce the same id.
 *
 * @param {Buffer} buf - Complete contents of the PDF file.
 * @returns {string} 32-character lowercase hex string.
 */
function sourceIdFromPdfBytes(buf) {
  const hasher = crypto.createHash('sha256');
  hasher.update(buf);
  const hexDigest = hasher.digest('hex');
  // Keep only the leading 32 of the 64 hex characters.
  return hexDigest.slice(0, 32);
}
| 20 | |
/**
 * Build a human-readable note title from a PDF file path.
 *
 * Takes the file name without its extension and collapses every run of
 * hyphens, underscores and whitespace into a single space, so a name such as
 * `Author - Title.pdf` yields `Author Title` rather than a title with
 * repeated spaces.
 *
 * @param {string} inputPath - Path (or bare file name) of the PDF.
 * @returns {string} The cleaned title, or `'Imported PDF'` when nothing is left.
 */
function titleFromPdfFilename(inputPath) {
  const stem = path.basename(inputPath, path.extname(inputPath));
  // Whitespace is part of the character class so that " - " or "_ _" does not
  // leave several consecutive spaces behind.
  const title = stem.replace(/[-_\s]+/g, ' ').trim();
  return title || 'Imported PDF';
}
| 29 | |
/**
 * Clean up text extracted from a PDF before it is used as a note body.
 *
 * Converts CRLF line endings to LF, replaces non-breaking spaces with regular
 * spaces, collapses three or more consecutive newlines into a single blank
 * line, and trims leading/trailing whitespace. Falsy input yields `''`.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Normalized text (possibly empty).
 */
function normalizeExtractedText(text) {
  const source = text ? String(text) : '';
  return source
    .replace(/\r\n/g, '\n')
    .replace(/\u00a0/g, ' ')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
| 39 | |
/**
 * Import a single PDF file into the vault as a Markdown note.
 *
 * The PDF's text is extracted with unpdf, normalized, and handed to
 * `writeNote` under `<outputBase>/imports/pdf/<first 12 chars of source_id>.md`
 * together with frontmatter describing the source file. Because the id is
 * derived from the file bytes, the same bytes and `outputBase` always map to
 * the same note path. With `dryRun` set, everything except the write happens.
 *
 * @param {string} input - Path to a .pdf file (absolute, or relative to the current working directory)
 * @param {{
 *   vaultPath: string,
 *   outputBase: string,
 *   project?: string | null,
 *   tags?: string[] | null,
 *   dryRun: boolean,
 *   onProgress?: (p: { progress: number, total?: number, message?: string }) => void | Promise<void>
 * }} ctx
 * @returns {Promise<{ imported: { path: string, source_id?: string }[], count: number }>}
 * @throws {Error} If the path is empty, does not exist, is not a regular file,
 *   does not end in `.pdf`, or no text could be extracted.
 */
export async function importPdf(input, ctx) {
  const raw = typeof input === 'string' ? input.trim() : '';
  if (!raw) throw new Error('PDF path is required');

  const { vaultPath, outputBase, project, tags, dryRun, onProgress } = ctx;
  if (onProgress) await onProgress({ progress: 0, total: 1, message: 'Reading PDF…' });

  const absInput = path.isAbsolute(raw) ? raw : path.resolve(process.cwd(), raw);
  if (!fs.existsSync(absInput)) {
    throw new Error(`Input not found: ${input}`);
  }
  if (!fs.statSync(absInput).isFile()) {
    throw new Error(`PDF import requires a single .pdf file (not a directory): ${input}`);
  }
  if (!absInput.toLowerCase().endsWith('.pdf')) {
    throw new Error(`PDF import requires a .pdf file; got: ${path.basename(absInput)}`);
  }

  const buf = fs.readFileSync(absInput);
  const source_id = sourceIdFromPdfBytes(buf);
  const short = source_id.slice(0, 12);
  // Note paths always use forward slashes, even where path.join emits backslashes.
  const outputRel = path.join(outputBase, 'imports', 'pdf', `${short}.md`).replace(/\\/g, '/');

  // `new Uint8Array(buf)` copies the bytes.
  // NOTE(review): presumably so PDF.js cannot detach or mutate `buf` — confirm.
  const pdf = await getDocumentProxy(new Uint8Array(buf));
  const { totalPages, text } = await extractText(pdf, { mergePages: true });
  const bodyText = normalizeExtractedText(text);
  if (!bodyText) {
    throw new Error('Could not extract text from this PDF (empty or image-only)');
  }

  // UTC calendar date, YYYY-MM-DD.
  const now = new Date().toISOString().slice(0, 10);
  const baseName = path.basename(absInput);
  const title = titleFromPdfFilename(absInput);

  const body =
    bodyText +
    '\n\n---\n\n' +
    `_Imported from PDF:_ \`${baseName}\` · ${totalPages} page(s).\n`;

  // A missing tag list means "no tags" instead of a TypeError. Arrays are
  // de-duplicated (first occurrence wins); any other value is passed through
  // unchanged, as before.
  let noteTags = tags == null ? [] : tags;
  if (Array.isArray(noteTags)) noteTags = [...new Set(noteTags)];

  const merged = {
    title,
    date: now,
    source: 'pdf-import',
    source_id,
    pdf_file: baseName,
    pdf_pages: totalPages,
    ...(project && { project: normalizeSlug(project) }),
    tags: noteTags,
  };

  if (!dryRun) {
    // Drop unset values so they never reach the frontmatter.
    const frontmatter = Object.fromEntries(
      Object.entries(merged).filter(([, v]) => v !== undefined && v !== null && v !== ''),
    );
    writeNote(vaultPath, outputRel, { body, frontmatter });
  }

  if (onProgress) await onProgress({ progress: 1, total: 1, message: 'Done' });

  return { imported: [{ path: outputRel, source_id }], count: 1 };
}
File History
2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠
1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6
docs: accept Calendar Events v0 spec with Phase 0 security …
Human
1 day ago