docx.mjs
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠ breaking
1 day ago
| 1 | /** |
| 2 | * Import a .docx file into a vault note (Markdown via mammoth). |
| 3 | */ |
| 4 | |
| 5 | import crypto from 'crypto'; |
| 6 | import fs from 'fs'; |
| 7 | import path from 'path'; |
| 8 | import mammoth from 'mammoth'; |
| 9 | import { writeNote } from '../write.mjs'; |
| 10 | import { normalizeSlug } from '../vault.mjs'; |
| 11 | |
/**
 * Derive a stable identifier from the raw file contents: the first 32 hex
 * characters of the SHA-256 digest of the bytes.
 * @param {Buffer} buf - File bytes to hash
 * @returns {string} 32-character lowercase hex string
 */
function sourceIdFromDocxBytes(buf) {
  const hasher = crypto.createHash('sha256');
  hasher.update(buf);
  const hexDigest = hasher.digest('hex');
  // Only the leading 32 hex chars are kept as the id.
  return hexDigest.slice(0, 32);
}
| 19 | |
/**
 * Build a human-readable note title from a file path: drop the directory and
 * extension, turn each run of hyphens/underscores into a single space, and
 * trim. Falls back to a fixed title when nothing is left.
 * @param {string} inputPath - Path to the source file
 * @returns {string} Title derived from the file name, or 'Imported DOCX'
 */
function titleFromDocxFilename(inputPath) {
  const extension = path.extname(inputPath);
  const stem = path.basename(inputPath, extension);
  const title = stem.replace(/[-_]+/g, ' ').trim();
  if (title === '') {
    return 'Imported DOCX';
  }
  return title;
}
| 28 | |
/**
 * Tidy converted Markdown: CRLF becomes LF, non-breaking spaces become regular
 * spaces, runs of three or more newlines collapse to one blank line, and
 * leading/trailing whitespace is trimmed. Falsy input yields an empty string.
 * @param {string} md - Raw Markdown text
 * @returns {string} Normalized Markdown (possibly empty)
 */
function normalizeMarkdownBody(md) {
  const text = md ? String(md) : '';
  return text
    .replace(/\r\n/g, '\n')
    .replace(/\u00a0/g, ' ')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
| 38 | |
/**
 * Import a single .docx file as one Markdown note.
 *
 * The file is read from disk, converted with `mammoth.convertToMarkdown`,
 * normalized, and (unless `ctx.dryRun` is truthy) passed to `writeNote` at
 * `<outputBase>/imports/docx/<first 12 hex chars of source_id>.md`.
 * Any conversion messages reported by mammoth are appended to the note body.
 *
 * @param {string} input - Path to a .docx file (relative paths resolve against process.cwd())
 * @param {{
 *   vaultPath: string,
 *   outputBase: string,
 *   project?: string | null,
 *   tags: string[],
 *   dryRun: boolean,
 *   onProgress?: (p: { progress: number, total?: number, message?: string }) => void | Promise<void>
 * }} ctx
 * @returns {Promise<{ imported: { path: string, source_id?: string }[], count: number }>}
 * @throws {Error} When the path is empty, does not exist, is not a regular file,
 *   does not end in `.docx`, cannot be converted (original error available as
 *   `cause`), or converts to an empty body.
 */
export async function importDocx(input, ctx) {
  const raw = typeof input === 'string' ? input.trim() : '';
  if (!raw) throw new Error('DOCX path is required');

  const { vaultPath, outputBase, project, tags, dryRun, onProgress } = ctx;
  if (onProgress) await onProgress({ progress: 0, total: 1, message: 'Reading DOCX…' });

  const absInput = path.isAbsolute(raw) ? raw : path.resolve(process.cwd(), raw);
  if (!fs.existsSync(absInput)) {
    throw new Error(`Input not found: ${input}`);
  }
  if (!fs.statSync(absInput).isFile()) {
    throw new Error(`DOCX import requires a single .docx file (not a directory): ${input}`);
  }
  if (!absInput.toLowerCase().endsWith('.docx')) {
    throw new Error(`DOCX import requires a .docx file; got: ${path.basename(absInput)}`);
  }

  const buf = fs.readFileSync(absInput);
  // The id is derived from the file bytes; its first 12 chars name the output file.
  const source_id = sourceIdFromDocxBytes(buf);
  const short = source_id.slice(0, 12);
  // Backslashes from path.join are rewritten so the relative path uses '/'.
  const outputRel = path.join(outputBase, 'imports', 'docx', `${short}.md`).replace(/\\/g, '/');

  let result;
  try {
    result = await mammoth.convertToMarkdown({ buffer: buf });
  } catch (e) {
    const msg = e && typeof e.message === 'string' ? e.message : String(e);
    // Keep the underlying failure reachable via `cause` instead of discarding it.
    throw new Error(`Could not read this DOCX (corrupt or not a Word document): ${msg}`, { cause: e });
  }

  const bodyMd = normalizeMarkdownBody(result.value);
  if (!bodyMd) {
    throw new Error('Could not convert this DOCX to usable text (empty document)');
  }

  // Date-only stamp (YYYY-MM-DD) taken from the ISO (UTC) timestamp.
  const now = new Date().toISOString().slice(0, 10);
  const baseName = path.basename(absInput);
  const title = titleFromDocxFilename(absInput);

  let body =
    bodyMd +
    '\n\n---\n\n' +
    `_Imported from DOCX:_ \`${baseName}\`.\n`;

  if (result.messages && result.messages.length > 0) {
    const lines = result.messages
      .map((m) => (m && typeof m.message === 'string' ? m.message.trim() : ''))
      .filter(Boolean);
    if (lines.length) {
      body += '\n_Conversion notes:_\n\n' + lines.map((l) => `- ${l}`).join('\n') + '\n';
    }
  }

  // Arrays are de-duplicated (order preserved). Non-array values pass through
  // unchanged; a missing `tags` is dropped by the filter below rather than
  // crashing on `.length` after the conversion work is already done.
  const noteTags = Array.isArray(tags) ? [...new Set(tags)] : tags;

  const merged = {
    title,
    date: now,
    source: 'docx-import',
    source_id,
    docx_file: baseName,
    ...(project && { project: normalizeSlug(project) }),
    tags: noteTags,
  };

  if (!dryRun) {
    // Strip unset fields so they are not passed on as frontmatter.
    const frontmatter = Object.fromEntries(
      Object.entries(merged).filter(([, v]) => v !== undefined && v !== null && v !== ''),
    );
    writeNote(vaultPath, outputRel, { body, frontmatter });
  }

  if (onProgress) await onProgress({ progress: 1, total: 1, message: 'Done' });

  return { imported: [{ path: outputRel, source_id }], count: 1 };
}
File History
2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠
1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6
docs: accept Calendar Events v0 spec with Phase 0 security …
Human
1 day ago