lib/section-source.mjs · aaronrene/knowtation — MuseHub

aaronrene / knowtation public

section-source.mjs

302 lines 9.2 KB

Raw

sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago

1	/**
2	* Body-free section source metadata for one Markdown note.
3	*
4	* This module is intentionally pure: no file reads, writes, CLI, MCP, hosted
5	* transport, Hub, search, indexes, vectors, memory, summaries, PageIndex, OCR,
6	* LLM calls, provider routing, snippets, or section body output.
7	*/
8
9	import { unified } from 'unified';
10	import remarkParse from 'remark-parse';
11	import { buildDocumentTreeFromOutline } from './document-tree.mjs';
12	import { buildNoteOutline, buildNoteOutlineFromMarkdown } from './note-outline.mjs';
13	import { normalizeSlug, parseFrontmatterAndBody } from './vault.mjs';
14
/** Schema identifier placed in the `schema` field of SectionSource results. */
export const SECTION_SOURCE_SCHEMA = 'knowtation.section_source/v0';
16
// Module-level remark parser, constructed once and reused for every parse call.
const parser = unified().use(remarkParse);
18
/**
 * Produce body-free SectionSource metadata from a raw Markdown file string.
 *
 * The text is split with `parseFrontmatterAndBody`; the resulting frontmatter
 * and body are passed, together with `notePath`, to `buildSectionSource`.
 *
 * @param {string} notePath - Path stored on the note handed to `buildSectionSource`.
 * @param {string} markdown - Raw Markdown file contents.
 * @param {{ maxInputChars?: number, maxHeadings?: number }} [options] - Forwarded unchanged.
 * @returns {{
 *   schema: string,
 *   path: string,
 *   title: string|null,
 *   sections: {
 *     section_id: string,
 *     heading_id: string,
 *     level: number,
 *     heading_path: string[],
 *     heading_text: string,
 *     child_section_ids: string[],
 *     body_available: boolean,
 *     body_returned: false,
 *     snippet_returned: false
 *   }[],
 *   truncated: boolean
 * }}
 * @throws {TypeError} When `markdown` is not a string.
 */
export function buildSectionSourceFromMarkdown(notePath, markdown, options = {}) {
  const isText = typeof markdown === 'string';
  if (!isText) {
    throw new TypeError('buildSectionSourceFromMarkdown: markdown must be a string');
  }

  const parsed = parseFrontmatterAndBody(markdown);
  const note = {
    path: notePath,
    frontmatter: parsed.frontmatter,
    body: parsed.body,
  };
  return buildSectionSource(note, options);
}
49
/**
 * Build body-free SectionSource metadata from a parsed vault note.
 *
 * The note is outlined with `buildNoteOutline`, then the outline and the note
 * body are combined by `buildSectionSourceFromOutlineAndBody`.
 *
 * @param {{ path: string, frontmatter?: Record<string, unknown>, body: string }} note
 * @param {{ maxInputChars?: number, maxHeadings?: number }} [options] - Passed to both steps.
 * @returns {{
 *   schema: string,
 *   path: string,
 *   title: string|null,
 *   sections: {
 *     section_id: string,
 *     heading_id: string,
 *     level: number,
 *     heading_path: string[],
 *     heading_text: string,
 *     child_section_ids: string[],
 *     body_available: boolean,
 *     body_returned: false,
 *     snippet_returned: false
 *   }[],
 *   truncated: boolean
 * }}
 * @throws {TypeError} When `note` is null/undefined or not an object, or when
 *   `note.body` is not a string.
 */
export function buildSectionSource(note, options = {}) {
  // `== null` deliberately matches both null and undefined.
  if (note == null || typeof note !== 'object') {
    throw new TypeError('buildSectionSource: note is required');
  }
  if (typeof note.body !== 'string') {
    throw new TypeError('buildSectionSource: note.body must be a string');
  }

  const outline = buildNoteOutline(note, options);
  return buildSectionSourceFromOutlineAndBody(outline, note.body, options);
}
83
/**
 * Build SectionSource metadata from a NoteOutline-compatible object plus the
 * current Markdown body. The body is parsed only to derive per-heading body
 * availability; no body text appears in the returned object.
 *
 * @param {{ path: string, title: string|null, headings: { level: number, text: string, id: string }[], truncated: boolean }} outline
 * @param {string} markdownBody
 * @param {{ maxHeadings?: number }} [options] - Forwarded to `buildDocumentTreeFromOutline`.
 * @returns {{
 *   schema: string,
 *   path: string,
 *   title: string|null,
 *   sections: {
 *     section_id: string,
 *     heading_id: string,
 *     level: number,
 *     heading_path: string[],
 *     heading_text: string,
 *     child_section_ids: string[],
 *     body_available: boolean,
 *     body_returned: false,
 *     snippet_returned: false
 *   }[],
 *   truncated: boolean
 * }}
 * @throws {TypeError} When `markdownBody` is not a string.
 */
export function buildSectionSourceFromOutlineAndBody(outline, markdownBody, options = {}) {
  if (typeof markdownBody !== 'string') {
    throw new TypeError('buildSectionSourceFromOutlineAndBody: markdownBody must be a string');
  }

  const tree = buildDocumentTreeFromOutline(outline, options);
  const bodyAvailability = bodyAvailabilityByHeadingId(markdownBody, tree.root.children);
  const sectionIds = new Map();
  // `||` (not `??`) so that an empty-string slug also falls back to 'note'.
  const pathSlug = normalizeSlug(tree.path) || 'note';

  // Ids are assigned for the whole tree first so that each section can list
  // its children's ids while the tree is being flattened.
  assignSectionIds(tree.root.children, pathSlug, sectionIds);

  return {
    schema: SECTION_SOURCE_SCHEMA,
    path: tree.path,
    title: tree.title,
    sections: flattenSections(tree.root.children, [], sectionIds, bodyAvailability),
    truncated: tree.truncated,
  };
}
129
/**
 * Record a section id for every node in the tree, parents before children.
 * Each id has the form `<pathSlug>:<node.id>` and is stored in `sectionIds`
 * under the node's own id.
 *
 * @param {{ id: string, children: unknown[] }[]} nodes
 * @param {string} pathSlug
 * @param {Map<string, string>} sectionIds - Mutated in place.
 */
function assignSectionIds(nodes, pathSlug, sectionIds) {
  const visit = (level) => {
    for (const { id, children } of level) {
      sectionIds.set(id, `${pathSlug}:${id}`);
      visit(children);
    }
  };
  visit(nodes);
}
142
/**
 * Flatten a heading tree into a pre-order list of body-free section records.
 *
 * Rows are appended to one shared array instead of spreading each subtree's
 * result into `push`, so a very wide subtree cannot exceed the engine's
 * argument-count limit and no subtree is copied more than once.
 *
 * @param {{ id: string, level: number, text: string, children: unknown[] }[]} nodes
 * @param {string[]} ancestors - Heading texts leading to `nodes`; not mutated.
 * @param {Map<string, string>} sectionIds - Heading id to section id.
 * @param {Map<string, boolean>} bodyAvailability - Heading id to availability flag.
 * @returns {{
 *   section_id: string,
 *   heading_id: string,
 *   level: number,
 *   heading_path: string[],
 *   heading_text: string,
 *   child_section_ids: string[],
 *   body_available: boolean,
 *   body_returned: false,
 *   snippet_returned: false
 * }[]}
 */
function flattenSections(nodes, ancestors, sectionIds, bodyAvailability) {
  const out = [];

  const walk = (level, trail) => {
    for (const node of level) {
      const headingPath = [...trail, node.text];
      out.push({
        section_id: sectionIds.get(node.id),
        heading_id: node.id,
        level: node.level,
        heading_path: headingPath,
        heading_text: node.text,
        child_section_ids: node.children.map((child) => sectionIds.get(child.id)),
        // Only a stored literal `true` counts; a missing entry yields false.
        body_available: bodyAvailability.get(node.id) === true,
        body_returned: false,
        snippet_returned: false,
      });
      walk(node.children, headingPath);
    }
  };

  walk(nodes, ancestors);
  return out;
}
179
/**
 * Decide, for each visible heading id, whether its section has body content.
 *
 * The i-th id (tree pre-order) is paired with the i-th heading node found in
 * the parsed body; an id with no heading node at that position maps to false.
 * NOTE(review): this positional pairing assumes the tree lists headings in the
 * same order and with the same coverage as the AST walk — confirm against the
 * outline builder.
 *
 * @param {string} markdownBody
 * @param {{ id: string }[]} visibleTreeNodes
 * @returns {Map<string, boolean>}
 */
function bodyAvailabilityByHeadingId(markdownBody, visibleTreeNodes) {
  const ids = flattenTreeHeadingIds(visibleTreeNodes);
  const ast = parser.parse(markdownBody);

  const headings = [];
  collectHeadingNodes(ast, headings);
  const blocks = Array.isArray(ast.children) ? ast.children : [];

  const availability = new Map();
  for (const [position, id] of ids.entries()) {
    const heading = headings[position];
    const available = heading
      ? hasBodyContentInBoundary(heading, headings, position, blocks)
      : false;
    availability.set(id, available);
  }
  return availability;
}
204
/**
 * List every heading id in the tree in pre-order (parent before its children).
 *
 * Ids are appended to one shared array instead of spreading each subtree's
 * result into `push`, so a very wide subtree cannot exceed the engine's
 * argument-count limit. A node whose `children` is not an array is treated as
 * a leaf.
 *
 * @param {{ id: string, children?: unknown[] }[]} nodes
 * @returns {string[]}
 */
function flattenTreeHeadingIds(nodes) {
  const ids = [];

  const walk = (level) => {
    for (const node of level) {
      ids.push(node.id);
      if (Array.isArray(node.children)) {
        walk(node.children);
      }
    }
  };

  walk(nodes);
  return ids;
}
217
/**
 * Depth-first walk that appends every node of type `heading` to `out`, in the
 * order encountered. Non-object values are ignored, and the walk does not
 * descend into a heading's own children.
 *
 * @param {unknown} node - Root of the (sub)tree to scan.
 * @param {unknown[]} out - Accumulator, mutated in place.
 */
function collectHeadingNodes(node, out) {
  if (node == null || typeof node !== 'object') return;
  if (node.type === 'heading') {
    out.push(node);
    return;
  }
  if (Array.isArray(node.children)) {
    for (const child of node.children) {
      collectHeadingNodes(child, out);
    }
  }
}
234
/**
 * Report whether any top-level block inside a heading's section carries
 * substantive content.
 *
 * The section runs from the end of the heading to the start of the next
 * heading whose depth is the same or shallower, so blocks under deeper
 * subheadings count towards this heading. Heading blocks themselves are
 * skipped, as are blocks without an integer start offset.
 *
 * @param {unknown} headingNode
 * @param {unknown[]} headingNodes - All collected heading nodes, in order.
 * @param {number} headingIndex - Position of `headingNode` in `headingNodes`.
 * @param {unknown[]} topLevelBlocks - Direct children of the parsed root.
 * @returns {boolean}
 */
function hasBodyContentInBoundary(headingNode, headingNodes, headingIndex, topLevelBlocks) {
  const startOffset = endOffset(headingNode);
  const endBoundary = nextPeerOrAncestorStartOffset(headingNode, headingNodes, headingIndex);

  return topLevelBlocks.some((node) => {
    if (node == null || typeof node !== 'object' || node.type === 'heading') return false;
    const nodeStart = startOffsetOf(node);
    // The section is the half-open range [startOffset, endBoundary).
    if (nodeStart == null || nodeStart < startOffset || nodeStart >= endBoundary) return false;
    return hasSubstantiveContent(node);
  });
}
253
/**
 * Find where the section of `headingNode` ends: the start offset of the first
 * later heading whose depth is less than or equal to its own. Matching
 * headings without an integer start offset are passed over. Returns positive
 * infinity when no such heading exists.
 *
 * @param {unknown} headingNode
 * @param {unknown[]} headingNodes
 * @param {number} headingIndex
 * @returns {number}
 */
function nextPeerOrAncestorStartOffset(headingNode, headingNodes, headingIndex) {
  const ownDepth = Number(headingNode.depth);
  let cursor = headingIndex + 1;

  while (cursor < headingNodes.length) {
    const later = headingNodes[cursor];
    cursor += 1;

    // Negated `<=` (rather than `>`) so a NaN depth is skipped as well.
    if (!(Number(later.depth) <= ownDepth)) continue;

    const start = startOffsetOf(later);
    if (start != null) return start;
  }

  return Number.POSITIVE_INFINITY;
}
271
/**
 * Report whether a node, or any of its descendants, carries content.
 *
 * A node counts when it has a non-blank string `value` or `alt`. Otherwise a
 * node with a `children` array counts only if one of its children does, and a
 * node with no `children` array counts only if it is a `thematicBreak`.
 *
 * @param {unknown} node
 * @returns {boolean}
 */
function hasSubstantiveContent(node) {
  if (node == null || typeof node !== 'object') return false;
  if (typeof node.value === 'string' && node.value.trim() !== '') return true;
  if (typeof node.alt === 'string' && node.alt.trim() !== '') return true;
  if (Array.isArray(node.children)) {
    return node.children.some((child) => hasSubstantiveContent(child));
  }
  return node.type === 'thematicBreak';
}
285
/**
 * Read `node.position.start.offset`, tolerating missing links in that chain.
 *
 * @param {unknown} node
 * @returns {number|null} The offset when it is an integer, otherwise null.
 */
function startOffsetOf(node) {
  const { offset } = node?.position?.start ?? {};
  if (Number.isInteger(offset)) {
    return offset;
  }
  return null;
}
294
/**
 * Read `node.position.end.offset`, tolerating missing links in that chain.
 *
 * @param {unknown} node
 * @returns {number} The offset when it is an integer, otherwise 0.
 */
function endOffset(node) {
  const { offset } = node?.position?.end ?? {};
  if (Number.isInteger(offset)) {
    return offset;
  }
  return 0;
}

File History 2 commits

sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ 1 day ago

sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 2 days ago