/**
 * Chunk Markdown for indexing. Split by heading or fixed size; configurable overlap.
 * SPEC §5: path, project, tags, date on each chunk; stable chunk id for upsert.
 */

/** Default chunk size in characters (~512 tokens at ~4 chars/token). */
const DEFAULT_CHUNK_SIZE = 2048;

/** Default overlap, in characters, between consecutive fixed-size chunks. */
const DEFAULT_CHUNK_OVERLAP = 256;

/**
 * Separators tried, in order of preference, when looking for a natural place to
 * end a fixed-size chunk: paragraph break, sentence end, then any space.
 */
const BREAK_SEPARATORS = ['\n\n', '. ', ' '];

/**
 * Find the preferred break position inside a window of text.
 *
 * Separators are tried in preference order rather than taking the maximum of all
 * positions: a `'. '` match is always followed by a space at a higher index, so a
 * plain maximum would always select the last space and never a sentence boundary.
 *
 * @param {string} windowText - Candidate chunk text (at most one chunk size long)
 * @param {number} minIndex - A break is only accepted strictly after this index
 * @returns {number} Index of the chosen separator, or -1 when none qualifies
 */
function findBreakIndex(windowText, minIndex) {
  for (const separator of BREAK_SEPARATORS) {
    const index = windowText.lastIndexOf(separator);
    if (index > minIndex) return index;
  }
  return -1;
}

/**
 * Cut one over-long section into fixed-size pieces with overlap.
 *
 * @param {string} section - Section text longer than `chunkSize`
 * @param {number} chunkSize - Maximum piece length in characters (>= 1)
 * @param {number} chunkOverlap - Characters repeated between consecutive pieces (>= 0)
 * @returns {string[]} Trimmed pieces (may contain empty strings; the caller filters)
 */
function splitOversizedSection(section, chunkSize, chunkOverlap) {
  const pieces = [];
  let start = 0;
  while (start < section.length) {
    let end = start + chunkSize;
    if (end < section.length) {
      // Only accept a break in the second half of the window so pieces stay reasonably full.
      const breakIndex = findBreakIndex(section.slice(start, end), chunkSize / 2);
      if (breakIndex !== -1) end = start + breakIndex + 1;
    } else {
      end = section.length;
    }
    pieces.push(section.slice(start, end).trim());
    if (end >= section.length) break;

    // Step back by the overlap, but always move forward: when the overlap is at
    // least as large as the piece just emitted, rewinding would revisit the same
    // window forever, so continue from `end` with no overlap for this step.
    const rewound = end - chunkOverlap;
    start = rewound > start ? rewound : end;
  }
  return pieces;
}

/**
 * Split text into chunks by heading (## or ###) first, then by size with overlap.
 *
 * Sections no longer than `chunkSize` are emitted whole; longer sections are cut
 * into fixed-size pieces, preferring to end a piece at a paragraph break, then a
 * sentence end, then a space.
 *
 * @param {string} text - Markdown body (no frontmatter); null/undefined yields []
 * @param {{ chunkSize?: number, chunkOverlap?: number }} options
 * @returns {string[]} Chunk texts (trimmed, never empty)
 * @throws {RangeError} If `chunkSize` is not a number >= 1 or `chunkOverlap` is not a number >= 0
 */
function splitByHeadingOrSize(text, options = {}) {
  const chunkSize = options.chunkSize ?? DEFAULT_CHUNK_SIZE;
  const chunkOverlap = options.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;

  // Reject values that would make the fixed-size loop stall or skip content.
  if (typeof chunkSize !== 'number' || !(chunkSize >= 1)) {
    throw new RangeError(`chunkSize must be a number >= 1, got ${String(chunkSize)}`);
  }
  if (typeof chunkOverlap !== 'number' || !(chunkOverlap >= 0)) {
    throw new RangeError(`chunkOverlap must be a number >= 0, got ${String(chunkOverlap)}`);
  }
  const size = Math.floor(chunkSize);
  const overlap = Math.floor(chunkOverlap);

  if (text == null) return [];
  const trimmed = text.trim();
  if (!trimmed) return [];

  // Zero-width lookahead keeps each heading line at the start of its own section.
  const sections = trimmed
    .split(/(?=^#{2,3}\s)/m)
    .map((s) => s.trim())
    .filter(Boolean);

  return sections
    .flatMap((section) =>
      section.length <= size ? [section] : splitOversizedSection(section, size, overlap),
    )
    .filter(Boolean);
}

/**
 * Build chunks for one note. Each chunk has text and metadata (path, project, tags,
 * date; optional causal_chain_id, entity, episode_id).
 *
 * Backslashes in the note path are normalised to forward slashes. Falsy `project`,
 * `date`, `causal_chain_id` and `episode_id` become `undefined`; non-array `tags`
 * and `entity` become empty arrays.
 *
 * @param {{ body: string, path: string, project?: string, tags?: string[], date?: string, causal_chain_id?: string, entity?: string[], episode_id?: string }} note - From vault.readNote
 * @param {{ chunkSize?: number, chunkOverlap?: number }} options
 * @returns {{ id: string, text: string, path: string, project?: string, tags: string[], date?: string, causal_chain_id?: string, entity: string[], episode_id?: string }[]}
 */
export function chunkNote(note, options = {}) {
  const chunks = splitByHeadingOrSize(note.body, options);
  const path = note.path.replace(/\\/g, '/');
  const project = note.project || undefined;
  const tags = Array.isArray(note.tags) ? note.tags : [];
  const date = note.date || undefined;
  const causal_chain_id = note.causal_chain_id || undefined;
  const entity = Array.isArray(note.entity) ? note.entity : [];
  const episode_id = note.episode_id || undefined;

  return chunks.map((text, index) => ({
    id: stableChunkId(path, index),
    text,
    path,
    project,
    // Copy per chunk so mutating one chunk's metadata cannot leak into its
    // siblings or back into the caller's note object.
    tags: [...tags],
    date,
    causal_chain_id,
    entity: [...entity],
    episode_id,
  }));
}

/**
 * Stable chunk id for upsert (no duplicates on re-run). SPEC §5.
 *
 * Backslashes become forward slashes and every character outside
 * `[a-zA-Z0-9/._-]` is replaced by `_`, then `_<index>` is appended.
 * NOTE(review): distinct paths that differ only in replaced characters
 * (e.g. "a b.md" vs "a_b.md", or non-ASCII names) map to the same id; the
 * format is left unchanged here so previously stored ids remain valid.
 *
 * @param {string} vaultRelativePath
 * @param {number} index
 * @returns {string}
 */
export function stableChunkId(vaultRelativePath, index) {
  const safe = vaultRelativePath.replace(/\\/g, '/').replace(/[^a-zA-Z0-9/._-]/g, '_');
  return `${safe}_${index}`;
}

export { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, splitByHeadingOrSize };