docx.mjs
128 lines 4.0 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago
1 /**
2 * Import a .docx file into a vault note (Markdown via mammoth).
3 */
4
5 import crypto from 'crypto';
6 import fs from 'fs';
7 import path from 'path';
8 import mammoth from 'mammoth';
9 import { writeNote } from '../write.mjs';
10 import { normalizeSlug } from '../vault.mjs';
11
/**
 * Compute a content-derived identifier for a DOCX file.
 *
 * The identifier is the first 32 hexadecimal characters of the SHA-256
 * digest of the given bytes, so identical bytes always produce the same id.
 *
 * @param {Buffer} buf - Raw bytes of the document.
 * @returns {string} 32-character lowercase hex string.
 */
function sourceIdFromDocxBytes(buf) {
  const hasher = crypto.createHash('sha256');
  hasher.update(buf);
  const fullHex = hasher.digest('hex');
  // Keep only the leading 32 of the digest's 64 hex characters.
  return fullHex.substring(0, 32);
}
19
/**
 * Derive a human-readable note title from a DOCX file path.
 *
 * The directory and the extension are dropped, then every run of hyphens,
 * underscores and whitespace is collapsed into a single space. Collapsing
 * whitespace together with the separators keeps names such as
 * `Report - final_v2.docx` from producing titles with doubled spaces.
 *
 * @param {string} inputPath - Path (or bare file name) of the source document.
 * @returns {string} The cleaned title, or `'Imported DOCX'` when nothing
 *   is left after cleaning.
 */
function titleFromDocxFilename(inputPath) {
  const stem = path.basename(inputPath, path.extname(inputPath));
  const title = stem.replace(/[-_\s]+/g, ' ').trim();
  return title || 'Imported DOCX';
}
28
/**
 * Tidy converted Markdown before it is stored in a note.
 *
 * Applied in order: CRLF line endings become LF, non-breaking spaces become
 * ordinary spaces, runs of three or more newlines shrink to exactly two, and
 * leading/trailing whitespace is trimmed. Falsy input yields an empty string.
 *
 * @param {string} md - Markdown text to normalise.
 * @returns {string} The normalised Markdown.
 */
function normalizeMarkdownBody(md) {
  const text = md ? String(md) : '';
  return text
    .replace(/\r\n/g, '\n')
    .replace(/\u00a0/g, ' ')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
38
/**
 * Import a single .docx file into the vault as a Markdown note.
 *
 * The file is read from disk, converted to Markdown with mammoth, and (unless
 * `dryRun` is set) written via `writeNote` to
 * `<outputBase>/imports/docx/<first 12 chars of source_id>.md`. The note body
 * ends with a footer naming the original file, followed by any conversion
 * messages mammoth reported.
 *
 * @param {string} input - Path to a .docx file; relative paths are resolved
 *   against `process.cwd()`.
 * @param {{
 *   vaultPath: string,
 *   outputBase: string,
 *   project?: string | null,
 *   tags: string[],
 *   dryRun: boolean,
 *   onProgress?: (p: { progress: number, total?: number, message?: string }) => void | Promise<void>
 * }} ctx - Import settings. A missing or null `tags` is treated as an empty
 *   list; array tags are de-duplicated.
 * @returns {Promise<{ imported: { path: string, source_id?: string }[], count: number }>}
 * @throws {Error} When the path is empty, missing, not a regular file, not
 *   named `*.docx`, cannot be converted (the underlying error is attached as
 *   `cause`), or converts to an empty document.
 */
export async function importDocx(input, ctx) {
  const raw = typeof input === 'string' ? input.trim() : '';
  if (!raw) throw new Error('DOCX path is required');

  const { vaultPath, outputBase, project, tags, dryRun, onProgress } = ctx;

  // Normalise tags once, up front. Arrays are de-duplicated; a missing value
  // becomes an empty list instead of crashing on `.length`; any other value
  // is passed through untouched, as it always was.
  let noteTags;
  if (Array.isArray(tags)) {
    noteTags = [...new Set(tags)];
  } else if (tags === undefined || tags === null) {
    noteTags = [];
  } else {
    noteTags = tags;
  }

  if (onProgress) await onProgress({ progress: 0, total: 1, message: 'Reading DOCX…' });

  const absInput = path.isAbsolute(raw) ? raw : path.resolve(process.cwd(), raw);
  if (!fs.existsSync(absInput)) {
    throw new Error(`Input not found: ${input}`);
  }
  if (!fs.statSync(absInput).isFile()) {
    throw new Error(`DOCX import requires a single .docx file (not a directory): ${input}`);
  }
  if (!absInput.toLowerCase().endsWith('.docx')) {
    throw new Error(`DOCX import requires a .docx file; got: ${path.basename(absInput)}`);
  }

  const buf = fs.readFileSync(absInput);
  const source_id = sourceIdFromDocxBytes(buf);
  // The output file name is the first 12 characters of the content id.
  const outputRel = path
    .join(outputBase, 'imports', 'docx', `${source_id.slice(0, 12)}.md`)
    .replace(/\\/g, '/');

  let result;
  try {
    result = await mammoth.convertToMarkdown({ buffer: buf });
  } catch (e) {
    const msg = e && typeof e.message === 'string' ? e.message : String(e);
    // Keep the original failure reachable through `cause` for debugging.
    throw new Error(`Could not read this DOCX (corrupt or not a Word document): ${msg}`, {
      cause: e,
    });
  }

  const bodyMd = normalizeMarkdownBody(result.value);
  if (!bodyMd) {
    throw new Error('Could not convert this DOCX to usable text (empty document)');
  }

  const now = new Date().toISOString().slice(0, 10);
  const baseName = path.basename(absInput);
  const title = titleFromDocxFilename(absInput);

  let body = `${bodyMd}\n\n---\n\n_Imported from DOCX:_ \`${baseName}\`.\n`;

  // Append whatever mammoth reported, skipping blank entries.
  const notes = (Array.isArray(result.messages) ? result.messages : [])
    .map((m) => (m && typeof m.message === 'string' ? m.message.trim() : ''))
    .filter(Boolean);
  if (notes.length > 0) {
    body += `\n_Conversion notes:_\n\n${notes.map((n) => `- ${n}`).join('\n')}\n`;
  }

  const merged = {
    title,
    date: now,
    source: 'docx-import',
    source_id,
    docx_file: baseName,
    ...(project && { project: normalizeSlug(project) }),
    tags: noteTags,
  };

  if (!dryRun) {
    // Drop unset/blank fields so they never reach the frontmatter.
    const frontmatter = Object.fromEntries(
      Object.entries(merged).filter(([, v]) => v !== undefined && v !== null && v !== ''),
    );
    writeNote(vaultPath, outputRel, { body, frontmatter });
  }

  if (onProgress) await onProgress({ progress: 1, total: 1, message: 'Done' });

  return { imported: [{ path: outputRel, source_id }], count: 1 };
}
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 1 day ago