pdf.mjs
115 lines 3.6 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago
1 /**
2 * Import a PDF file into a vault note (plain text extracted via unpdf / PDF.js).
3 */
4
5 import '../shims/promise-try.mjs';
6 import crypto from 'crypto';
7 import fs from 'fs';
8 import path from 'path';
9 import { extractText, getDocumentProxy } from 'unpdf';
10 import { writeNote } from '../write.mjs';
11 import { normalizeSlug } from '../vault.mjs';
12
/**
 * Derive a stable identifier for a PDF from its raw bytes.
 *
 * The id is the first 32 hexadecimal characters of the SHA-256 digest of
 * `buf`, so identical bytes always yield the same id.
 *
 * @param {Buffer} buf - Raw file contents.
 * @returns {string} 32-character hex string.
 */
function sourceIdFromPdfBytes(buf) {
  const hasher = crypto.createHash('sha256');
  hasher.update(buf);
  const hexDigest = hasher.digest('hex');
  return hexDigest.substring(0, 32);
}
20
/**
 * Build a human-readable note title from a PDF's file name.
 *
 * The directory and extension are dropped, every run of hyphens and
 * underscores is replaced by a single space, and surrounding whitespace is
 * trimmed. If nothing is left, the fallback title 'Imported PDF' is returned.
 *
 * @param {string} inputPath - Path to the PDF file.
 * @returns {string} Title derived from the file name.
 */
function titleFromPdfFilename(inputPath) {
  const extension = path.extname(inputPath);
  const stem = path.basename(inputPath, extension);
  const spaced = stem.replace(/[-_]+/g, ' ').trim();
  if (spaced === '') {
    return 'Imported PDF';
  }
  return spaced;
}
29
/**
 * Tidy up text extracted from a PDF before it is written into a note.
 *
 * Falsy input is treated as an empty string. CRLF line endings become LF,
 * non-breaking spaces (U+00A0) become regular spaces, runs of three or more
 * newlines collapse to exactly two, and the result is trimmed.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Normalized text (possibly empty).
 */
function normalizeExtractedText(text) {
  return String(text || '')
    .replace(/\r\n/g, '\n')
    .replace(/\u00a0/g, ' ')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
39
/**
 * Import a single PDF file into the vault as a Markdown note.
 *
 * The file is read fully into memory, hashed to obtain a stable `source_id`,
 * and its text is extracted with unpdf. The note is written to
 * `<outputBase>/imports/pdf/<first 12 hex chars of source_id>.md` (always
 * with forward slashes) unless `dryRun` is set, in which case everything
 * except the write is still performed.
 *
 * @param {string} input - Path to a .pdf file; relative paths are resolved
 *   against `process.cwd()`.
 * @param {{
 *   vaultPath: string,
 *   outputBase: string,
 *   project?: string | null,
 *   tags: string[],
 *   dryRun: boolean,
 *   onProgress?: (p: { progress: number, total?: number, message?: string }) => void | Promise<void>
 * }} ctx - Import settings. A missing or null `tags` is treated as an empty list.
 * @returns {Promise<{ imported: { path: string, source_id?: string }[], count: number }>}
 *   The vault-relative path and source id of the single imported note.
 * @throws {Error} If `input` is empty or not a string, the path does not
 *   exist, is not a regular file, does not end in `.pdf`, or no text could
 *   be extracted from the document.
 */
export async function importPdf(input, ctx) {
  const raw = typeof input === 'string' ? input.trim() : '';
  if (!raw) throw new Error('PDF path is required');

  const { vaultPath, outputBase, project, dryRun, onProgress } = ctx;
  // Default absent tags up front so a missing value cannot raise a TypeError
  // after the PDF has already been read and parsed.
  const tags = ctx.tags == null ? [] : ctx.tags;
  if (onProgress) await onProgress({ progress: 0, total: 1, message: 'Reading PDF…' });

  const absInput = path.isAbsolute(raw) ? raw : path.resolve(process.cwd(), raw);
  if (!fs.existsSync(absInput)) {
    throw new Error(`Input not found: ${input}`);
  }
  if (!fs.statSync(absInput).isFile()) {
    throw new Error(`PDF import requires a single .pdf file (not a directory): ${input}`);
  }
  if (!absInput.toLowerCase().endsWith('.pdf')) {
    throw new Error(`PDF import requires a .pdf file; got: ${path.basename(absInput)}`);
  }

  const buf = fs.readFileSync(absInput);
  const source_id = sourceIdFromPdfBytes(buf);
  // The note file name uses only a short prefix of the id; the full id is
  // kept in the frontmatter.
  const short = source_id.slice(0, 12);
  const outputRel = path.join(outputBase, 'imports', 'pdf', `${short}.md`).replace(/\\/g, '/');

  const pdf = await getDocumentProxy(new Uint8Array(buf));
  let totalPages;
  let text;
  try {
    ({ totalPages, text } = await extractText(pdf, { mergePages: true }));
  } finally {
    // Release the document proxy once extraction is finished, including when
    // extraction throws, so repeated imports do not accumulate parsed PDFs.
    if (typeof pdf.destroy === 'function') await pdf.destroy();
  }

  const bodyText = normalizeExtractedText(text);
  if (!bodyText) {
    throw new Error('Could not extract text from this PDF (empty or image-only)');
  }

  // Calendar date (YYYY-MM-DD) taken from the UTC ISO timestamp.
  const now = new Date().toISOString().slice(0, 10);
  const baseName = path.basename(absInput);
  const title = titleFromPdfFilename(absInput);

  const body =
    bodyText +
    '\n\n---\n\n' +
    `_Imported from PDF:_ \`${baseName}\` · ${totalPages} page(s).\n`;

  const merged = {
    title,
    date: now,
    source: 'pdf-import',
    source_id,
    pdf_file: baseName,
    pdf_pages: totalPages,
  };
  if (project) merged.project = normalizeSlug(project);
  // Arrays are de-duplicated (first occurrence wins); any other value, such
  // as a plain string, is passed through unchanged. An empty string is then
  // removed by the frontmatter filter below.
  merged.tags = Array.isArray(tags) ? [...new Set(tags)] : tags;

  if (!dryRun) {
    // Drop unset values so they do not appear as empty frontmatter keys.
    const frontmatter = Object.fromEntries(
      Object.entries(merged).filter(([, v]) => v !== undefined && v !== null && v !== ''),
    );
    writeNote(vaultPath, outputRel, { body, frontmatter });
  }

  if (onProgress) await onProgress({ progress: 1, total: 1, message: 'Done' });

  return { imported: [{ path: outputRel, source_id }], count: 1 };
}
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 1 day ago