section-source.mjs
302 lines 9.2 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago
1 /**
2 * Body-free section source metadata for one Markdown note.
3 *
4 * This module is intentionally pure: no file reads, writes, CLI, MCP, hosted
5 * transport, Hub, search, indexes, vectors, memory, summaries, PageIndex, OCR,
6 * LLM calls, provider routing, snippets, or section body output.
7 */
8
9 import { unified } from 'unified';
10 import remarkParse from 'remark-parse';
11 import { buildDocumentTreeFromOutline } from './document-tree.mjs';
12 import { buildNoteOutline, buildNoteOutlineFromMarkdown } from './note-outline.mjs';
13 import { normalizeSlug, parseFrontmatterAndBody } from './vault.mjs';
14
/** Schema identifier placed in the `schema` field of every SectionSource result. */
export const SECTION_SOURCE_SCHEMA = 'knowtation.section_source/v0';

/** Module-level unified/remark-parse processor used to parse note bodies into an AST. */
const parser = unified().use(remarkParse);
18
/**
 * Produce body-free SectionSource metadata for a note given its raw Markdown
 * file text.
 *
 * The text is split into frontmatter and body with `parseFrontmatterAndBody`,
 * and the resulting note is handed to `buildSectionSource` together with the
 * caller's options.
 *
 * @param {string} notePath Path stored on the note passed to `buildSectionSource`.
 * @param {string} markdown Raw Markdown file content.
 * @param {{ maxInputChars?: number, maxHeadings?: number }} [options] Forwarded unchanged.
 * @returns {{
 *   schema: string,
 *   path: string,
 *   title: string|null,
 *   sections: {
 *     section_id: string,
 *     heading_id: string,
 *     level: number,
 *     heading_path: string[],
 *     heading_text: string,
 *     child_section_ids: string[],
 *     body_available: boolean,
 *     body_returned: false,
 *     snippet_returned: false
 *   }[],
 *   truncated: boolean
 * }}
 * @throws {TypeError} When `markdown` is not a string.
 */
export function buildSectionSourceFromMarkdown(notePath, markdown, options = {}) {
  const isText = typeof markdown === 'string';
  if (!isText) {
    throw new TypeError('buildSectionSourceFromMarkdown: markdown must be a string');
  }

  const parsed = parseFrontmatterAndBody(markdown);
  const note = {
    path: notePath,
    frontmatter: parsed.frontmatter,
    body: parsed.body,
  };
  return buildSectionSource(note, options);
}
49
/**
 * Produce body-free SectionSource metadata for an already-parsed vault note.
 *
 * The note is first turned into an outline with `buildNoteOutline`; the
 * outline and the note body are then passed to
 * `buildSectionSourceFromOutlineAndBody`.
 *
 * @param {{ path: string, frontmatter?: Record<string, unknown>, body: string }} note
 * @param {{ maxInputChars?: number, maxHeadings?: number }} [options] Forwarded unchanged to both helpers.
 * @returns {{
 *   schema: string,
 *   path: string,
 *   title: string|null,
 *   sections: {
 *     section_id: string,
 *     heading_id: string,
 *     level: number,
 *     heading_path: string[],
 *     heading_text: string,
 *     child_section_ids: string[],
 *     body_available: boolean,
 *     body_returned: false,
 *     snippet_returned: false
 *   }[],
 *   truncated: boolean
 * }}
 * @throws {TypeError} When `note` is null/undefined or not an object, or when
 *   `note.body` is not a string.
 */
export function buildSectionSource(note, options = {}) {
  const isObjectNote = typeof note === 'object' && note !== null;
  if (!isObjectNote) {
    throw new TypeError('buildSectionSource: note is required');
  }

  const hasTextBody = typeof note.body === 'string';
  if (!hasTextBody) {
    throw new TypeError('buildSectionSource: note.body must be a string');
  }

  // The outline is built first; the body is read afterwards, as an argument.
  return buildSectionSourceFromOutlineAndBody(buildNoteOutline(note, options), note.body, options);
}
83
/**
 * Assemble SectionSource metadata from a NoteOutline-compatible object and the
 * note's current Markdown body.
 *
 * The outline is converted to a document tree with
 * `buildDocumentTreeFromOutline`. The body is used only to compute each
 * section's `body_available` flag; it is not part of the returned value.
 * Each section id has the form `<pathSlug>:<headingId>`, where `pathSlug` is
 * `normalizeSlug(tree.path)` or the literal `'note'` when that is falsy.
 *
 * @param {{ path: string, title: string|null, headings: { level: number, text: string, id: string }[], truncated: boolean }} outline
 * @param {string} markdownBody
 * @param {{ maxHeadings?: number }} [options] Forwarded to `buildDocumentTreeFromOutline`.
 * @returns {{
 *   schema: string,
 *   path: string,
 *   title: string|null,
 *   sections: {
 *     section_id: string,
 *     heading_id: string,
 *     level: number,
 *     heading_path: string[],
 *     heading_text: string,
 *     child_section_ids: string[],
 *     body_available: boolean,
 *     body_returned: false,
 *     snippet_returned: false
 *   }[],
 *   truncated: boolean
 * }}
 * @throws {TypeError} When `markdownBody` is not a string.
 */
export function buildSectionSourceFromOutlineAndBody(outline, markdownBody, options = {}) {
  const bodyIsText = typeof markdownBody === 'string';
  if (!bodyIsText) {
    throw new TypeError('buildSectionSourceFromOutlineAndBody: markdownBody must be a string');
  }

  const tree = buildDocumentTreeFromOutline(outline, options);
  const topLevelNodes = tree.root.children;

  // Availability is computed before any ids are assigned.
  const availabilityByHeading = bodyAvailabilityByHeadingId(markdownBody, topLevelNodes);

  const slug = normalizeSlug(tree.path) || 'note';
  const idsByHeading = new Map();
  assignSectionIds(topLevelNodes, slug, idsByHeading);

  const sections = flattenSections(topLevelNodes, [], idsByHeading, availabilityByHeading);
  return {
    schema: SECTION_SOURCE_SCHEMA,
    path: tree.path,
    title: tree.title,
    sections,
    truncated: tree.truncated,
  };
}
129
/**
 * Record a section id for every node of the tree in `sectionIds`, keyed by the
 * node's heading id. Each value is `<pathSlug>:<node.id>`. Nodes are visited
 * parent first, then children in order.
 *
 * @param {{ id: string, children: unknown[] }[]} nodes Top-level nodes to walk.
 * @param {string} pathSlug Prefix used for every generated section id.
 * @param {Map<string, string>} sectionIds Map that is filled in place.
 * @returns {void}
 */
function assignSectionIds(nodes, pathSlug, sectionIds) {
  // Explicit stack; entries are pushed in reverse so they pop in document order.
  const pending = [...nodes].reverse();
  while (pending.length > 0) {
    const { id, children } = pending.pop();
    sectionIds.set(id, `${pathSlug}:${id}`);
    for (let position = children.length - 1; position >= 0; position -= 1) {
      pending.push(children[position]);
    }
  }
}
142
/**
 * Flatten the heading tree into a list of section records: each node is
 * followed by all of its descendants before its next sibling.
 *
 * Every record has `body_returned` and `snippet_returned` set to `false`.
 * `body_available` is `true` only when `bodyAvailability` holds exactly `true`
 * for the node's heading id.
 *
 * @param {{ id: string, level: number, text: string, children: unknown[] }[]} nodes
 * @param {string[]} ancestors Heading texts of the enclosing sections, outermost first.
 * @param {Map<string, string>} sectionIds Section id per heading id.
 * @param {Map<string, boolean>} bodyAvailability Body availability per heading id.
 * @returns {{
 *   section_id: string,
 *   heading_id: string,
 *   level: number,
 *   heading_path: string[],
 *   heading_text: string,
 *   child_section_ids: string[],
 *   body_available: boolean,
 *   body_returned: false,
 *   snippet_returned: false
 * }[]}
 */
function flattenSections(nodes, ancestors, sectionIds, bodyAvailability) {
  const flattened = [];

  // Appends the records for `siblings` (and their subtrees) to `flattened`.
  const visit = (siblings, trail) => {
    for (const current of siblings) {
      const { id, level, text, children } = current;
      const headingPath = [...trail, text];
      flattened.push({
        section_id: sectionIds.get(id),
        heading_id: id,
        level,
        heading_path: headingPath,
        heading_text: text,
        child_section_ids: children.map((child) => sectionIds.get(child.id)),
        body_available: bodyAvailability.get(id) === true,
        body_returned: false,
        snippet_returned: false,
      });
      visit(children, headingPath);
    }
  };

  visit(nodes, ancestors);
  return flattened;
}
179
/**
 * Compute, for every heading id in the visible tree, whether its section has
 * body content.
 *
 * Heading ids (parent first, then children) are paired by position with the
 * heading nodes collected from the parsed body: the i-th id is matched with
 * the i-th heading node. An id with no heading node at its position maps to
 * `false`.
 *
 * @param {string} markdownBody Markdown body to parse.
 * @param {{ id: string }[]} visibleTreeNodes Top-level nodes of the visible tree.
 * @returns {Map<string, boolean>} Body availability keyed by heading id.
 */
function bodyAvailabilityByHeadingId(markdownBody, visibleTreeNodes) {
  const orderedIds = flattenTreeHeadingIds(visibleTreeNodes);

  const root = parser.parse(markdownBody);
  const parsedHeadings = [];
  collectHeadingNodes(root, parsedHeadings);
  const blocks = Array.isArray(root.children) ? root.children : [];

  const availability = new Map();
  for (let position = 0; position < orderedIds.length; position += 1) {
    const parsedHeading = parsedHeadings[position];
    const available = parsedHeading
      ? hasBodyContentInBoundary(parsedHeading, parsedHeadings, position, blocks)
      : false;
    availability.set(orderedIds[position], available);
  }
  return availability;
}
204
/**
 * List the ids of all nodes in the tree, each parent immediately followed by
 * its descendants. A node whose `children` is not an array is treated as
 * having no children.
 *
 * @param {{ id: string, children?: unknown[] }[]} nodes
 * @returns {string[]}
 */
function flattenTreeHeadingIds(nodes) {
  return nodes.flatMap((node) => {
    const descendants = Array.isArray(node.children) ? flattenTreeHeadingIds(node.children) : [];
    return [node.id, ...descendants];
  });
}
217
/**
 * Append every node of type `'heading'` found under `node` to `out`, in
 * document order. The search does not descend into a heading's own children;
 * null and non-object values are ignored.
 *
 * @param {unknown} node Root of the subtree to scan.
 * @param {unknown[]} out Array that receives the heading nodes.
 * @returns {void}
 */
function collectHeadingNodes(node, out) {
  // Explicit stack; children are pushed in reverse so they pop in document order.
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current === null || typeof current !== 'object') continue;

    if (current.type === 'heading') {
      out.push(current);
      continue;
    }

    const { children } = current;
    if (!Array.isArray(children)) continue;
    for (let position = children.length - 1; position >= 0; position -= 1) {
      pending.push(children[position]);
    }
  }
}
234
/**
 * Report whether the section opened by `headingNode` contains substantive
 * content.
 *
 * The section runs from the heading's end offset up to (but not including) the
 * offset returned by `nextPeerOrAncestorStartOffset`. Only top-level blocks
 * that are not headings and that start inside that range are considered.
 *
 * @param {unknown} headingNode Heading that opens the section.
 * @param {unknown[]} headingNodes All collected heading nodes.
 * @param {number} headingIndex Index of `headingNode` within `headingNodes`.
 * @param {unknown[]} topLevelBlocks Top-level children of the parsed body.
 * @returns {boolean}
 */
function hasBodyContentInBoundary(headingNode, headingNodes, headingIndex, topLevelBlocks) {
  const sectionStart = endOffset(headingNode);
  const sectionEnd = nextPeerOrAncestorStartOffset(headingNode, headingNodes, headingIndex);

  for (const block of topLevelBlocks) {
    const isCandidate = block != null && typeof block === 'object' && block.type !== 'heading';
    if (!isCandidate) continue;

    const blockStart = startOffsetOf(block);
    if (blockStart == null) continue;

    const outsideSection = blockStart < sectionStart || blockStart >= sectionEnd;
    if (outsideSection) continue;

    if (hasSubstantiveContent(block)) return true;
  }
  return false;
}
253
/**
 * Find where the section opened by `headingNode` ends: the start offset of the
 * first later heading whose depth is less than or equal to this heading's
 * depth. Such a heading without an integer start offset is skipped and the
 * search continues.
 *
 * @param {unknown} headingNode Heading that opens the section.
 * @param {unknown[]} headingNodes All collected heading nodes.
 * @param {number} headingIndex Index of `headingNode` within `headingNodes`.
 * @returns {number} The boundary offset, or `Number.POSITIVE_INFINITY` when no
 *   later heading closes the section.
 */
function nextPeerOrAncestorStartOffset(headingNode, headingNodes, headingIndex) {
  const ownDepth = Number(headingNode.depth);
  const closesSection = (later) => Number(later.depth) <= ownDepth;

  let cursor = headingIndex + 1;
  while (cursor < headingNodes.length) {
    const later = headingNodes[cursor];
    cursor += 1;
    if (closesSection(later)) {
      const boundary = startOffsetOf(later);
      if (boundary != null) return boundary;
    }
  }
  return Number.POSITIVE_INFINITY;
}
271
/**
 * Decide whether a node counts as real content.
 *
 * A node is substantive when its `value` or `alt` is a string with
 * non-whitespace characters. Otherwise, a node with a `children` array is
 * substantive exactly when at least one child is. A node with neither is
 * substantive only if its type is `'thematicBreak'`.
 *
 * @param {unknown} node
 * @returns {boolean}
 */
function hasSubstantiveContent(node) {
  if (node === null || typeof node !== 'object') return false;

  const isNonBlank = (text) => typeof text === 'string' && text.trim() !== '';
  if (isNonBlank(node.value) || isNonBlank(node.alt)) return true;

  if (!Array.isArray(node.children)) {
    return node.type === 'thematicBreak';
  }

  for (const child of node.children) {
    if (hasSubstantiveContent(child)) return true;
  }
  return false;
}
285
/**
 * Read `node.position.start.offset`, tolerating a missing node or missing
 * intermediate objects.
 *
 * @param {unknown} node
 * @returns {number|null} The offset when it is an integer, otherwise `null`.
 */
function startOffsetOf(node) {
  const start = node?.position?.start;
  const candidate = start?.offset;
  if (Number.isInteger(candidate)) {
    return candidate;
  }
  return null;
}
294
/**
 * Read `node.position.end.offset`, tolerating a missing node or missing
 * intermediate objects.
 *
 * @param {unknown} node
 * @returns {number} The offset when it is an integer, otherwise `0`.
 */
function endOffset(node) {
  const end = node?.position?.end;
  const candidate = end?.offset;
  if (Number.isInteger(candidate)) {
    return candidate;
  }
  return 0;
}
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 2 days ago