chunk.mjs
sha256:6a102aafafdfe7e70a24f4e59740200f0ee713ce7915f1b53e9d4ba5ee8b4410
Initial Muse snapshot
Human
47 days ago
| 1 | /** |
| 2 | * Chunk Markdown for indexing. Split by heading or fixed size; configurable overlap. |
| 3 | * SPEC §5: path, project, tags, date on each chunk; stable chunk id for upsert. |
| 4 | */ |
| 5 | |
/**
 * Default chunk size in characters (~512 tokens at ~4 chars/token) and the
 * default overlap, also in characters, between consecutive fixed-size chunks.
 */
const DEFAULT_CHUNK_SIZE = 2048;
const DEFAULT_CHUNK_OVERLAP = 256;

/**
 * Find where to cut a fixed-size slice so the chunk ends on a natural boundary.
 * Separators are tried in order of preference: paragraph break, sentence end,
 * then a single space. The last occurrence of the first qualifying separator wins.
 * @param {string} candidate - The fixed-size slice that is about to become a chunk
 * @param {number} minIndex - A separator only qualifies if its index is greater than this
 * @returns {number} Number of characters to keep (separator index + 1), or -1 if none qualifies
 */
function findBreakLength(candidate, minIndex) {
  for (const separator of ['\n\n', '. ', ' ']) {
    const index = candidate.lastIndexOf(separator);
    if (index > minIndex) return index + 1;
  }
  return -1;
}

/**
 * Split text into chunks by heading (## or ###) first, then by size with overlap.
 *
 * Sections no longer than `chunkSize` are emitted whole. Longer sections are cut
 * into windows of at most `chunkSize` characters; each window is shortened to end
 * on a paragraph break, sentence end, or space when one exists in its second half.
 * Consecutive windows of the same section share `chunkOverlap` characters.
 * Fractional option values are rounded down.
 *
 * @param {string} text - Markdown body (no frontmatter); null/undefined yields []
 * @param {{ chunkSize?: number, chunkOverlap?: number }} options
 * @returns {string[]} Trimmed, non-empty chunk texts in document order
 * @throws {RangeError} If chunkSize is not a number >= 1 or chunkOverlap is not a number >= 0
 */
function splitByHeadingOrSize(text, options = {}) {
  const chunkSize = Math.floor(options.chunkSize ?? DEFAULT_CHUNK_SIZE);
  // Negated comparisons so that NaN is rejected as well.
  if (!(chunkSize >= 1)) {
    throw new RangeError(`chunkSize must be a number >= 1, got ${options.chunkSize}`);
  }
  const requestedOverlap = Math.floor(options.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP);
  if (!(requestedOverlap >= 0)) {
    throw new RangeError(`chunkOverlap must be a number >= 0, got ${options.chunkOverlap}`);
  }
  // An overlap as large as the window would re-read the same text forever.
  const overlap = Math.min(requestedOverlap, chunkSize - 1);
  // A break must sit in the second half of the window AND past the overlap,
  // otherwise the next window would start at or before the current one.
  const minBreakIndex = Math.max(chunkSize / 2, overlap);

  const trimmed = (text ?? '').trim();
  if (!trimmed) return [];

  // Zero-width split: each piece starts at a line beginning with "## " or "### ".
  const sections = trimmed
    .split(/(?=^#{2,3}\s)/m)
    .map((s) => s.trim())
    .filter(Boolean);
  const out = [];

  for (const section of sections) {
    if (section.length <= chunkSize) {
      out.push(section);
      continue;
    }

    // Fixed-size split with overlap.
    let start = 0;
    while (start < section.length) {
      let end = Math.min(start + chunkSize, section.length);
      if (end < section.length) {
        const keep = findBreakLength(section.slice(start, end), minBreakIndex);
        if (keep > 0) end = start + keep;
      }
      out.push(section.slice(start, end).trim());
      if (end >= section.length) break;
      // end - start > overlap is guaranteed above, so this always moves forward.
      start = end - overlap;
    }
  }

  // Windows made only of whitespace trim to '' and are dropped.
  return out.filter(Boolean);
}
| 56 | |
/**
 * Build chunks for one note. Each chunk has text and metadata (path, project, tags, date;
 * optional causal_chain_id, entity, episode_id).
 *
 * The note body is split with splitByHeadingOrSize; every resulting chunk carries the
 * note's metadata plus an id derived from the normalized path and the chunk's position.
 * Falsy project/date/causal_chain_id/episode_id become undefined; non-array tags/entity
 * become []. Each chunk receives its own copy of the tags and entity arrays, so changing
 * one chunk's metadata does not affect the other chunks or the input note.
 *
 * @param {{ body: string, path: string, project?: string, tags?: string[], date?: string, causal_chain_id?: string, entity?: string[], episode_id?: string }} note - From vault.readNote
 * @param {{ chunkSize?: number, chunkOverlap?: number }} options - Passed through to splitByHeadingOrSize
 * @returns {{ id: string, text: string, path: string, project?: string, tags: string[], date?: string, causal_chain_id?: string, entity: string[], episode_id?: string }[]}
 */
export function chunkNote(note, options = {}) {
  const texts = splitByHeadingOrSize(note.body, options);
  // Backslashes become forward slashes so the stored path does not depend on the OS.
  const path = note.path.replace(/\\/g, '/');
  const project = note.project || undefined;
  const tags = Array.isArray(note.tags) ? note.tags : [];
  const date = note.date || undefined;
  const causal_chain_id = note.causal_chain_id || undefined;
  const entity = Array.isArray(note.entity) ? note.entity : [];
  const episode_id = note.episode_id || undefined;

  return texts.map((text, index) => ({
    id: stableChunkId(path, index),
    text,
    path,
    project,
    // Shallow copies: no array instance is shared between chunk records.
    tags: [...tags],
    date,
    causal_chain_id,
    entity: [...entity],
    episode_id,
  }));
}
| 85 | |
/**
 * Derive the id under which a chunk is upserted (SPEC §5), so re-running the
 * indexer on the same path and position yields the same id.
 *
 * Backslashes are turned into forward slashes, then every character other than
 * ASCII letters, digits, "/", ".", "_" and "-" is replaced by "_". Paths that
 * differ only in replaced characters therefore produce the same id.
 *
 * @param {string} vaultRelativePath - Note path relative to the vault root
 * @param {number} index - Position of the chunk within its note
 * @returns {string} `<sanitized path>_<index>`
 */
export function stableChunkId(vaultRelativePath, index) {
  const posixPath = vaultRelativePath.replace(/\\/g, '/');
  const idPrefix = posixPath.replace(/[^a-zA-Z0-9/._-]/g, '_');
  return `${idPrefix}_${index}`;
}
| 96 | |
| 97 | export { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, splitByHeadingOrSize }; |
File History
1 commit
sha256:6a102aafafdfe7e70a24f4e59740200f0ee713ce7915f1b53e9d4ba5ee8b4410
Initial Muse snapshot
Human
47 days ago