chunk.mjs
97 lines 3.4 KB
Raw
1 /**
2 * Chunk Markdown for indexing. Split by heading or fixed size; configurable overlap.
3 * SPEC §5: path, project, tags, date on each chunk; stable chunk id for upsert.
4 */
5
/** Default maximum chunk length in characters (~512 tokens at ~4 chars/token). */
const DEFAULT_CHUNK_SIZE = 512 * 4;

/** Default number of characters shared between consecutive size-split chunks. */
const DEFAULT_CHUNK_OVERLAP = 256;
11
/**
 * Split Markdown text into chunks: first at `##` / `###` headings, then any
 * section longer than `chunkSize` into windows of at most `chunkSize`
 * characters that overlap by `chunkOverlap` characters.
 *
 * Inside an oversized section the cut point is moved back to just after the
 * latest occurrence of a blank line (`\n\n`), a sentence end (`. `) or a space
 * within the window, but only when that occurrence lies past the window's
 * midpoint; otherwise the window is cut at exactly `chunkSize` characters.
 * Every chunk is trimmed and empty chunks are dropped.
 *
 * @param {string} text - Markdown body (no frontmatter); `null`/`undefined` yields `[]`
 * @param {{ chunkSize?: number, chunkOverlap?: number }} options - Sizes in characters;
 *   default to DEFAULT_CHUNK_SIZE and DEFAULT_CHUNK_OVERLAP
 * @returns {string[]} Chunk texts, in document order
 * @throws {RangeError} If `chunkSize` is not greater than 0 or `chunkOverlap` is negative or NaN
 */
function splitByHeadingOrSize(text, options = {}) {
  const chunkSize = options.chunkSize ?? DEFAULT_CHUNK_SIZE;
  const chunkOverlap = options.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;

  // Negated comparisons so that NaN is rejected as well. A non-positive size
  // could never make progress; a negative overlap would silently skip text.
  if (!(chunkSize > 0)) {
    throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`);
  }
  if (!(chunkOverlap >= 0)) {
    throw new RangeError(`chunkOverlap must be a non-negative number, got ${chunkOverlap}`);
  }

  const trimmed = (text ?? '').trim();
  if (!trimmed) return [];

  // The lookahead keeps each heading line at the start of its own section.
  const sections = trimmed
    .split(/(?=^#{2,3}\s)/m)
    .map((s) => s.trim())
    .filter(Boolean);
  const out = [];

  for (const section of sections) {
    if (section.length <= chunkSize) {
      out.push(section);
      continue;
    }

    // Fixed-size split with overlap.
    let start = 0;
    while (start < section.length) {
      let end = start + chunkSize;
      if (end < section.length) {
        const windowText = section.slice(start, end);
        const lastBreak = Math.max(
          windowText.lastIndexOf('\n\n'),
          windowText.lastIndexOf('. '),
          windowText.lastIndexOf(' '),
        );
        // Only accept a break in the second half so chunks do not get too short.
        if (lastBreak > chunkSize / 2) {
          end = start + lastBreak + 1;
        }
      } else {
        end = section.length;
      }

      out.push(section.slice(start, end).trim());
      if (end >= section.length) break;

      // Step back by the overlap, but never to or before the current start:
      // when the overlap is at least as long as the window just emitted, the
      // overlap is dropped for this step so the loop always advances.
      const next = end - chunkOverlap;
      start = next > start ? next : end;
    }
  }

  return out.filter(Boolean);
}
56
/**
 * Turn one note into chunk records ready for indexing.
 *
 * The note body is split with splitByHeadingOrSize; every resulting chunk carries
 * the same note-level metadata plus an id derived from the note path and the
 * chunk's position. Backslashes in the path are converted to forward slashes.
 * Falsy `project`, `date`, `causal_chain_id` and `episode_id` become `undefined`;
 * `tags` and `entity` fall back to `[]` when they are not arrays. The `tags` and
 * `entity` arrays are shared by reference between all returned chunks.
 *
 * @param {{ body: string, path: string, project?: string, tags?: string[], date?: string, causal_chain_id?: string, entity?: string[], episode_id?: string }} note - From vault.readNote
 * @param {{ chunkSize?: number, chunkOverlap?: number }} options - Forwarded to splitByHeadingOrSize
 * @returns {{ id: string, text: string, path: string, project?: string, tags: string[], date?: string, causal_chain_id?: string, entity: string[], episode_id?: string }[]}
 */
export function chunkNote(note, options = {}) {
  const pieces = splitByHeadingOrSize(note.body, options);
  const normalizedPath = note.path.replace(/\\/g, '/');

  // Metadata common to every chunk of this note; key order matches the record layout.
  const shared = {
    path: normalizedPath,
    project: note.project || undefined,
    tags: Array.isArray(note.tags) ? note.tags : [],
    date: note.date || undefined,
    causal_chain_id: note.causal_chain_id || undefined,
    entity: Array.isArray(note.entity) ? note.entity : [],
    episode_id: note.episode_id || undefined,
  };

  const records = [];
  for (const [position, text] of pieces.entries()) {
    records.push({
      id: stableChunkId(normalizedPath, position),
      text,
      ...shared,
    });
  }
  return records;
}
85
/**
 * Build the chunk id used for upserts, so re-indexing the same note overwrites
 * its chunks instead of duplicating them (SPEC §5).
 *
 * Backslashes become forward slashes, then every character outside
 * `a-z A-Z 0-9 / . _ -` is replaced with `_`, and `_<index>` is appended.
 * Note that paths differing only in replaced characters produce the same id.
 *
 * @param {string} vaultRelativePath - Note path relative to the vault
 * @param {number} index - Position of the chunk within the note
 * @returns {string} Id of the form `<sanitized path>_<index>`
 */
export function stableChunkId(vaultRelativePath, index) {
  const forwardSlashed = vaultRelativePath.replace(/\\/g, '/');
  const sanitized = forwardSlashed.replace(/[^a-zA-Z0-9/._-]/g, '_');
  return `${sanitized}_${index}`;
}
96
97 export { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, splitByHeadingOrSize };
File History 1 commit