lib/chunk.mjs · aaronrene/knowtation — MuseHub

aaronrene / knowtation public

chunk.mjs

97 lines 3.4 KB

Raw

sha256:6a102aafafdfe7e70a24f4e59740200f0ee713ce7915f1b53e9d4ba5ee8b4410 Initial Muse snapshot Human 47 days ago

1	/**
2	* Chunk Markdown for indexing. Split by heading or fixed size; configurable overlap.
3	* SPEC §5: path, project, tags, date on each chunk; stable chunk id for upsert.
4	*/
5
/**
 * Default maximum chunk length, in characters:
 * about 512 tokens at roughly 4 characters per token.
 */
const DEFAULT_CHUNK_SIZE = 512 * 4;

/**
 * Default overlap, in characters, used when a section is split by size.
 */
const DEFAULT_CHUNK_OVERLAP = 256;
11
/**
 * Split Markdown text into chunks.
 *
 * The text is first cut in front of every `##` / `###` heading. A section that
 * fits within `chunkSize` becomes one chunk. A longer section is cut into
 * windows of at most `chunkSize` characters; each window after the first starts
 * `chunkOverlap` characters before the end of the previous one.
 *
 * When a window has to be cut short of the section end, the cut is moved back
 * to a natural boundary in the second half of the window, preferring (in this
 * order) a blank line, the end of a sentence (`". "`), then any space. If no
 * such boundary exists the window is cut at exactly `chunkSize` characters.
 *
 * @param {string | null | undefined} text - Markdown body (no frontmatter).
 *   `null` / `undefined` are treated as empty text.
 * @param {{ chunkSize?: number, chunkOverlap?: number }} options
 *   `chunkSize` must be greater than 0. A negative `chunkOverlap` is treated
 *   as 0. If the overlap is so large that the next window would not start
 *   after the current one, the next window starts where the current one ended
 *   (no overlap for that step), so the loop always terminates.
 * @returns {string[]} Trimmed, non-empty chunk texts in document order.
 * @throws {RangeError} If `chunkSize` is zero or negative.
 */
function splitByHeadingOrSize(text, options = {}) {
  const chunkSize = options.chunkSize ?? DEFAULT_CHUNK_SIZE;
  const chunkOverlap = options.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
  if (chunkSize <= 0) {
    // A non-positive window can never advance through the text.
    throw new RangeError(`chunkSize must be greater than 0, got ${chunkSize}`);
  }
  // Negative overlap would skip text between chunks; clamp it away.
  const overlap = Math.max(0, chunkOverlap);

  const trimmed = (text ?? '').trim();
  if (!trimmed) return [];

  // Zero-width split: each piece starts at a "## " or "### " heading line.
  const sections = trimmed
    .split(/(?=^#{2,3}\s)/m)
    .map((s) => s.trim())
    .filter(Boolean);

  // Separators in order of preference when shortening a window.
  const separators = ['\n\n', '. ', ' '];
  // Only accept a boundary in the second half, so chunks do not get tiny.
  const minBreak = chunkSize / 2;
  const out = [];

  for (const section of sections) {
    if (section.length <= chunkSize) {
      out.push(section);
      continue;
    }

    let start = 0;
    while (start < section.length) {
      let end = start + chunkSize;
      if (end < section.length) {
        const slice = section.slice(start, end);
        const breakAt = separators
          .map((sep) => slice.lastIndexOf(sep))
          .find((idx) => idx > minBreak);
        if (breakAt !== undefined) {
          // +1 keeps the period of ". " (or one whitespace char, trimmed below).
          end = start + breakAt + 1;
        }
      } else {
        end = section.length;
      }

      out.push(section.slice(start, end).trim());
      if (end >= section.length) break;

      // Step back by the overlap, but never to (or before) the current start:
      // that would repeat the same window forever.
      const next = end - overlap;
      start = next > start ? next : end;
    }
  }

  return out.filter(Boolean);
}
56
/**
 * Turn one note into index-ready chunks.
 *
 * The note body is split with `splitByHeadingOrSize`; every resulting chunk
 * carries the same note-level metadata plus its own `id` and `text`.
 *
 * Metadata normalisation:
 * - `path`: backslashes are replaced with forward slashes.
 * - `project`, `date`, `causal_chain_id`, `episode_id`: any falsy value becomes `undefined`.
 * - `tags`, `entity`: used as given when they are arrays, otherwise `[]`.
 *   The same array instance is shared by all chunks of the note.
 *
 * @param {{ body: string, path: string, project?: string, tags?: string[], date?: string, causal_chain_id?: string, entity?: string[], episode_id?: string }} note - From vault.readNote
 * @param {{ chunkSize?: number, chunkOverlap?: number }} options - Passed through to `splitByHeadingOrSize`.
 * @returns {{ id: string, text: string, path: string, project?: string, tags: string[], date?: string, causal_chain_id?: string, entity: string[], episode_id?: string }[]}
 *   One record per chunk, in document order; `id` is `stableChunkId(path, index)`.
 */
export function chunkNote(note, options = {}) {
  const pieces = splitByHeadingOrSize(note.body, options);

  // Note-level fields, computed once and copied onto every chunk record.
  const shared = {
    path: note.path.replace(/\\/g, '/'),
    project: note.project || undefined,
    tags: Array.isArray(note.tags) ? note.tags : [],
    date: note.date || undefined,
    causal_chain_id: note.causal_chain_id || undefined,
    entity: Array.isArray(note.entity) ? note.entity : [],
    episode_id: note.episode_id || undefined,
  };

  const records = [];
  for (const [index, text] of pieces.entries()) {
    records.push({
      id: stableChunkId(shared.path, index),
      text,
      ...shared,
    });
  }
  return records;
}
85
/**
 * Build the id of a chunk from its note path and position, so that re-running
 * the indexer produces the same id for the same chunk (upsert, no duplicates).
 * SPEC §5.
 *
 * Backslashes in the path become forward slashes; every character other than
 * ASCII letters, digits, `/`, `.`, `_` and `-` is then replaced with `_`.
 *
 * @param {string} vaultRelativePath - Note path relative to the vault.
 * @param {number} index - Position of the chunk within the note.
 * @returns {string} The sanitised path followed by `_` and the index.
 */
export function stableChunkId(vaultRelativePath, index) {
  const forwardSlashed = vaultRelativePath.replace(/\\/g, '/');
  const sanitized = forwardSlashed.replace(/[^a-zA-Z0-9/._-]/g, '_');
  return `${sanitized}_${index}`;
}
96
97	export { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, splitByHeadingOrSize };

File History 1 commit

sha256:6a102aafafdfe7e70a24f4e59740200f0ee713ce7915f1b53e9d4ba5ee8b4410 Initial Muse snapshot Human 47 days ago