proposal-enrich-llm.mjs
301 lines 10.5 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 2 days ago
1 /**
2 * Shared proposal Enrich: LLM prompt, parse, validate/normalize (SPEC-aligned metadata).
3 * Used by self-hosted hub/server.mjs and hub/gateway/proposal-enrich-hosted.mjs.
4 */
5
6 import { normalizeSlug, normalizeTags } from './vault.mjs';
7
/** Value advertised as `enrich_version` in the JSON schema shown to the model. */
export const ENRICH_VERSION = 2;

/**
 * Allow-list of keys the model may place inside suggested_frontmatter
 * (SPEC §2.1 + §2.3). Anything outside this set is discarded during validation.
 */
export const SUGGESTED_FRONTMATTER_KEYS = new Set([
  // Descriptive fields.
  'title', 'project', 'tags',
  // Temporal anchors.
  'date', 'updated',
  // Provenance.
  'source', 'source_id',
  // Purpose and relationships.
  'intent', 'follows', 'causal_chain_id', 'entity', 'episode_id',
  // Summarization metadata.
  'summarizes', 'summarizes_range', 'state_snapshot',
]);
28
// Any key beginning with one of these prefixes is rejected outright.
const FORBIDDEN_KEY_PREFIXES = ['knowtation_'];

// Exact key names (compared in lower case) that suggestions may never set.
const FORBIDDEN_KEYS = new Set([
  'author_kind',
  // Blockchain / payment fields.
  'network', 'wallet_address', 'tx_hash', 'payment_status',
  // approval_log etc.
  'kind',
]);
38
// Size limits applied while parsing and normalizing model output.
const MAX_SUMMARY_CHARS = 8_000; // summary text
const MAX_LABELS = 8; // number of suggested labels kept
const MAX_LABEL_LEN = 64; // characters per label
const MAX_SCALAR_CHARS = 512; // date, updated, source, source_id, summarizes_range
const MAX_TITLE_CHARS = 500; // title
const MAX_INTENT_CHARS = 2_000; // intent
const MAX_PATH_SEGMENTS = 20; // segments in a vault-relative path
const MAX_JSON_OUTPUT_CHARS = 14_000; // serialized suggested_frontmatter
47
/**
 * Reports whether a frontmatter key must be rejected.
 * Non-string and empty keys count as forbidden; otherwise the key is
 * lower-cased and checked against FORBIDDEN_KEYS and FORBIDDEN_KEY_PREFIXES.
 * @param {unknown} k
 * @returns {boolean}
 */
function isForbiddenKey(k) {
  const isUsableKey = typeof k === 'string' && k.length > 0;
  if (!isUsableKey) return true;

  const normalized = k.toLowerCase();
  return (
    FORBIDDEN_KEYS.has(normalized) ||
    FORBIDDEN_KEY_PREFIXES.some((prefix) => normalized.startsWith(prefix))
  );
}
57
/**
 * Decides whether a value looks like a safe vault-relative path.
 * Backslashes are treated as forward slashes. The path is rejected when it is
 * empty, contains a NUL byte, starts with '/', contains '..' anywhere, has a
 * '.' or '..' segment, or has more than MAX_PATH_SEGMENTS non-empty segments.
 * @param {string} s
 * @returns {boolean}
 */
function isSafeVaultPathLike(s) {
  if (typeof s !== 'string') return false;

  const normalized = s.trim().split('\\').join('/');
  if (normalized === '' || normalized.includes('\0')) return false;
  if (normalized.startsWith('/') || normalized.includes('..')) return false;

  const segments = normalized.split('/').filter((segment) => segment !== '');
  if (segments.length > MAX_PATH_SEGMENTS) return false;
  return segments.every((segment) => segment !== '.' && segment !== '..');
}
74
/**
 * Trims a string and cuts it to at most `max` characters.
 * Non-string input yields the empty string.
 * @param {unknown} s
 * @param {number} max
 * @returns {string}
 */
function clampStr(s, max) {
  if (typeof s !== 'string') return '';
  // slice() is a no-op when the trimmed text already fits within `max`.
  return s.trim().slice(0, max);
}
80
/**
 * Builds the system and user prompts for the proposal Enrich LLM call.
 * The system prompt carries the JSON schema and rules; the user prompt carries
 * the note path, the stated intent, per-field guidance and the (truncated) body.
 * @param {{ path?: string, intent?: string, body?: string }} input
 * @param {{ bodyMaxChars?: number }} [opts] bodyMaxChars defaults to 12,000.
 * @returns {{ system: string, user: string }}
 */
export function buildEnrichMessages(input, opts = {}) {
  const textOr = (value, fallback) => (value == null ? fallback : String(value));

  const bodyLimit = opts.bodyMaxChars ?? 12_000;
  const notePath = textOr(input.path, '');
  const statedIntent = textOr(input.intent, '—');
  const noteBody = textOr(input.body, '').slice(0, bodyLimit);
  const keyList = Array.from(SUGGESTED_FRONTMATTER_KEYS).sort().join(', ');

  const system = [
    'Reply with ONLY valid JSON (no markdown fences). Schema:',
    '{',
    `"enrich_version": ${ENRICH_VERSION},`,
    '"summary": "one short paragraph describing the proposed change",',
    '"suggested_labels": ["short-tag", ...],',
    `"suggested_frontmatter": { ... optional; only keys from this allow-list: ${keyList} }`,
    '}',
    'Rules:',
    `- suggested_labels: at most ${MAX_LABELS} strings; lowercase slugs (a-z, 0-9, hyphen).`,
    '- suggested_frontmatter: only use keys from the allow-list. Prefer returning every field that is clearly grounded in the note path, intent, headings, body text, or explicit references. Omit only fields that are genuinely unsupported by the content.',
    '- For project, causal_chain_id, episode_id, entity (if string), tags: use slug form (lowercase, hyphens).',
    '- entity may be a string or array of strings (each normalized as slug).',
    '- follows and summarizes may be a vault-relative path string or array of such paths (e.g. inbox/note.md).',
    '- state_snapshot must be boolean if present.',
    '- Do NOT include knowtation_* keys, author_kind, or blockchain fields (network, wallet_address, tx_hash, payment_status).',
  ].join('\n');

  const user = [
    `Path: ${notePath}`,
    `Intent: ${statedIntent}`,
    '',
    'Extract metadata from the actual content, not just from any existing frontmatter-like wording.',
    '',
    'Field guidance:',
    '- title: infer a strong note title from the heading/body when possible.',
    '- project: infer from the note path, named initiative, or clearly repeated project framing.',
    '- tags: include concrete topic tags from the body, not generic filler.',
    '- date / updated: include only if the document states a concrete date or time anchor.',
    '- source / source_id: include only if the text names a source system, document, URL, export id, ticket id, etc.',
    '- intent: infer the operational purpose if the body makes it clearer than the provided intent.',
    '- follows: include prior note paths only when the body/path clearly references an earlier note or continuation.',
    '- causal_chain_id: infer when the document describes a named chain of events, incident thread, rollout, investigation, or dependency chain.',
    '- entity: extract the main people, teams, companies, products, systems, repos, or domains discussed.',
    '- episode_id: infer when this note belongs to a clearly bounded episode, milestone, launch, sprint, outage, or test run.',
    '- summarizes / summarizes_range: infer only when the note explicitly summarizes another note, folder, period, or range.',
    '- state_snapshot: true only when the note is clearly capturing status/state at a moment in time.',
    '',
    'Prioritize temporal, causal, entity, and relationship metadata when the content supports it. Do not stop at title/project/tags if richer grounded metadata is present.',
    '---',
    noteBody,
  ].join('\n');

  return { system, user };
}
131
/**
 * Parses the raw LLM reply into a loose (not yet validated) enrich result.
 *
 * The reply is expected to be a JSON object, optionally wrapped in a markdown
 * code fence. When it cannot be parsed as a JSON object, `parseOk` is false,
 * `enrich_version` is 1 and the trimmed raw text is used as the summary.
 *
 * @param {string} rawText
 * @returns {{
 *   enrich_version: number,
 *   summary: string,
 *   suggested_labels: string[],
 *   suggested_frontmatter: Record<string, unknown>,
 *   parseOk: boolean,
 * }}
 */
export function parseEnrichModelOutput(rawText) {
  const raw = rawText != null ? String(rawText) : '';
  const trimmed = raw.trim();
  let summary = trimmed;
  const suggested_labels = [];
  /** @type {Record<string, unknown>} */
  const suggested_frontmatter = {};
  let enrich_version = 1;
  let parseOk = false;
  try {
    // Strip the fence from the already-trimmed text: the opening pattern is
    // anchored at the start, so leading whitespace would otherwise defeat it.
    const cleaned = trimmed
      .replace(/^```(?:json)?\s*/i, '')
      .replace(/\s*```$/, '')
      .trim();
    const j = JSON.parse(cleaned);
    if (j && typeof j === 'object' && !Array.isArray(j)) {
      parseOk = true;
      if (typeof j.summary === 'string') summary = j.summary;
      if (typeof j.enrich_version === 'number' && Number.isFinite(j.enrich_version)) {
        enrich_version = j.enrich_version;
      }
      if (Array.isArray(j.suggested_labels)) {
        for (const x of j.suggested_labels) {
          // Keep only real scalars; String(null) or String({}) would invent
          // labels such as "null" or "[object Object]".
          const isScalar = typeof x === 'string' || (typeof x === 'number' && Number.isFinite(x));
          if (isScalar) suggested_labels.push(String(x));
        }
      }
      const sf = j.suggested_frontmatter;
      if (sf && typeof sf === 'object' && !Array.isArray(sf)) {
        for (const [k, v] of Object.entries(sf)) {
          // JSON.parse yields "__proto__" as an own key; assigning it onto a
          // plain object would replace that object's prototype.
          if (k === '__proto__') continue;
          suggested_frontmatter[k] = v;
        }
      }
    }
  } catch {
    /* Deliberate best-effort: keep summary = trimmed raw text, parseOk = false. */
  }
  return {
    enrich_version,
    summary: clampStr(summary, MAX_SUMMARY_CHARS),
    suggested_labels,
    suggested_frontmatter,
    parseOk,
  };
}
182
/**
 * Normalize labels for storage (slug-like tags).
 *
 * Labels are stringified, passed through normalizeTags, truncated to
 * MAX_LABEL_LEN characters, de-duplicated, and capped at MAX_LABELS entries.
 * Truncation happens before de-duplication so that two long labels sharing the
 * same prefix cannot come out as duplicates.
 *
 * @param {string[]} labels
 * @returns {string[]} Empty array when `labels` is not an array.
 */
export function normalizeSuggestedLabels(labels) {
  if (!Array.isArray(labels)) return [];
  const unique = new Set();
  for (const tag of normalizeTags(labels.map((x) => String(x)))) {
    if (!tag) continue;
    unique.add(tag.length > MAX_LABEL_LEN ? tag.slice(0, MAX_LABEL_LEN) : tag);
  }
  return [...unique].slice(0, MAX_LABELS);
}
192
/**
 * Returns the text of a genuine scalar (string or finite number) and '' for
 * everything else, so that objects, arrays, null and booleans are never
 * stringified into values such as "[object Object]" or "null".
 * @param {unknown} val
 * @returns {string}
 */
function scalarText(val) {
  if (typeof val === 'string') return val;
  if (typeof val === 'number' && Number.isFinite(val)) return String(val);
  return '';
}

/**
 * Filters and normalizes model-suggested frontmatter.
 *
 * Only keys in SUGGESTED_FRONTMATTER_KEYS that are not forbidden are kept.
 * Per-key handling:
 * - tags: normalized via normalizeTags, at most 32.
 * - entity: slug or array of slugs (at most 32); a single entry collapses to a string.
 * - project, causal_chain_id, episode_id: slug.
 * - follows, summarizes: safe vault-relative path(s) with '/' separators
 *   (at most 32); a single entry collapses to a string.
 * - state_snapshot: boolean, or the strings 'true' / 'false'.
 * - title, intent, date, updated, source, source_id, summarizes_range: trimmed
 *   and length-clamped text.
 * Values that are not strings or finite numbers (where text is expected) and
 * values that normalize to nothing are dropped.
 *
 * @param {Record<string, unknown>} raw
 * @returns {Record<string, unknown>} Empty object when `raw` is not a plain object.
 */
export function validateAndNormalizeSuggestedFrontmatter(raw) {
  if (!raw || typeof raw !== 'object' || Array.isArray(raw)) return {};
  /** @type {Record<string, unknown>} */
  const out = {};
  for (const [key, val] of Object.entries(raw)) {
    if (!SUGGESTED_FRONTMATTER_KEYS.has(key) || isForbiddenKey(key)) continue;
    // Single values and arrays share one code path below.
    const items = Array.isArray(val) ? val : [val];
    switch (key) {
      case 'tags': {
        const candidates = items.map((x) => scalarText(x)).filter(Boolean);
        const tags = normalizeTags(candidates);
        if (tags.length) out.tags = tags.slice(0, 32);
        break;
      }
      case 'entity': {
        const ents = items
          .map((x) => scalarText(x))
          .filter((text) => text.trim())
          .map((text) => normalizeSlug(text))
          .filter(Boolean)
          .slice(0, 32);
        if (ents.length) out.entity = ents.length === 1 ? ents[0] : ents;
        break;
      }
      case 'project':
      case 'causal_chain_id':
      case 'episode_id': {
        const slug = normalizeSlug(scalarText(val));
        if (slug) out[key] = slug;
        break;
      }
      case 'follows':
      case 'summarizes': {
        const paths = [];
        for (const p of items) {
          const candidate = scalarText(p).trim();
          if (isSafeVaultPathLike(candidate)) paths.push(candidate.replace(/\\/g, '/'));
        }
        if (paths.length) out[key] = paths.length === 1 ? paths[0] : paths.slice(0, 32);
        break;
      }
      case 'state_snapshot': {
        if (val === true || val === false) out.state_snapshot = val;
        else if (val === 'true' || val === 'false') out.state_snapshot = val === 'true';
        break;
      }
      case 'title': {
        const text = clampStr(scalarText(val), MAX_TITLE_CHARS);
        if (text) out.title = text;
        break;
      }
      case 'intent': {
        const text = clampStr(scalarText(val), MAX_INTENT_CHARS);
        if (text) out.intent = text;
        break;
      }
      case 'date':
      case 'updated':
      case 'source':
      case 'source_id':
      case 'summarizes_range': {
        const text = clampStr(scalarText(val), MAX_SCALAR_CHARS);
        if (text) out[key] = text;
        break;
      }
      default:
        break;
    }
  }
  return out;
}
259
/**
 * Runs the whole post-LLM pipeline on the raw reply text: parse, normalize the
 * labels, validate the frontmatter, and drop the frontmatter entirely when its
 * JSON form is longer than MAX_JSON_OUTPUT_CHARS or cannot be serialized.
 * @param {string} rawText
 * @returns {{
 *   enrich_version: number,
 *   summary: string,
 *   suggested_labels: string[],
 *   suggested_frontmatter: Record<string, unknown>,
 *   parseOk: boolean,
 * }}
 */
export function validateAndNormalizeEnrichResult(rawText) {
  const loose = parseEnrichModelOutput(rawText);
  const labels = normalizeSuggestedLabels(loose.suggested_labels);
  const frontmatter = validateAndNormalizeSuggestedFrontmatter(loose.suggested_frontmatter);

  // A serialization failure is treated the same as exceeding the size budget.
  let withinBudget;
  try {
    withinBudget = JSON.stringify(frontmatter).length <= MAX_JSON_OUTPUT_CHARS;
  } catch {
    withinBudget = false;
  }

  return {
    enrich_version: loose.enrich_version,
    summary: loose.summary,
    suggested_labels: labels,
    suggested_frontmatter: withinBudget ? frontmatter : {},
    parseOk: loose.parseOk,
  };
}
286
/**
 * Serialize normalized frontmatter for canister / API (bounded).
 * Returns the JSON text, or '{}' when the input is null/undefined, cannot be
 * serialized, or serializes to more than MAX_JSON_OUTPUT_CHARS characters.
 * @param {Record<string, unknown>} fm
 * @returns {string}
 */
export function serializeSuggestedFrontmatterJson(fm) {
  try {
    const encoded = JSON.stringify(fm ?? {});
    // Reading .length stays inside the try: JSON.stringify can return
    // undefined (e.g. for a function), which must also fall back to '{}'.
    return encoded.length > MAX_JSON_OUTPUT_CHARS ? '{}' : encoded;
  } catch {
    return '{}';
  }
}
300
/** Public name for the size limit on serialized suggested_frontmatter JSON. */
export const ENRICH_SUGGESTED_FRONTMATTER_MAX_JSON_CHARS = MAX_JSON_OUTPUT_CHARS;
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 2 days ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 2 days ago