scripts/validate-deepinfra-enrich.mjs · aaronrene/knowtation — MuseHub

aaronrene / knowtation public

validate-deepinfra-enrich.mjs

337 lines 11.5 KB

Raw

sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago

1	#!/usr/bin/env node
2	/**
3	* DeepInfra Enrich validation harness.
4	*
5	* Purpose: before flipping the hosted Hub gateway from OpenAI to DeepInfra,
6	* verify that the chosen DeepInfra chat model returns JSON that
7	* `validateAndNormalizeEnrichResult` can parse for a representative spread
8	* of proposal-shaped inputs. Fail fast (exit 1) if any sample fails to parse,
9	* since downstream the canister will store empty `suggested_frontmatter` JSON
10	* and reviewers lose the metadata-suggestion benefit.
11	*
12	* Usage (Track A — flip explicit provider):
13	*
14	* DEEPINFRA_API_KEY=di-...\
15	* KNOWTATION_CHAT_PROVIDER=deepinfra \
16	* DEEPINFRA_CHAT_MODEL=Qwen/Qwen2.5-72B-Instruct \
17	* node scripts/validate-deepinfra-enrich.mjs
18	*
19	* Usage (Track B — control: re-run against current OpenAI to compare):
20	*
21	* OPENAI_API_KEY=sk-...\
22	* KNOWTATION_CHAT_PROVIDER=openai \
23	* node scripts/validate-deepinfra-enrich.mjs
24	*
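25	* Exit code 0 = enough samples passed to meet the pass threshold
26	* (see --pass-threshold); 1 = too few samples parsed cleanly and stayed
26	* within the SPEC §2 allow-list; 2 = no LLM API key is set or the harness crashed.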
27	*
28	* Privacy: each sample body below is synthetic. No vault data is sent.
29	* If you want to stress-test against real proposal bodies, pass
30	* --vault-sample <path-to-md> [--vault-sample <path>...]
31	* (paths are read with fs.readFileSync — they never leave your machine until
32	* the LLM call goes out, which you have already approved by setting the API key).
33	*/
34
35	import fs from 'node:fs';
36	import path from 'node:path';
37	import { fileURLToPath } from 'node:url';
38	import { completeChat } from '../lib/llm-complete.mjs';
39	import {
40	buildEnrichMessages,
41	validateAndNormalizeEnrichResult,
42	SUGGESTED_FRONTMATTER_KEYS,
43	} from '../lib/proposal-enrich-llm.mjs';
44
// Directory containing this script. ES modules do not provide `__dirname`,
// so it is rebuilt from `import.meta.url`.
// NOTE(review): nothing else in this file appears to read this constant — confirm before removing.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
46
/**
 * Built-in synthetic proposal samples sent through the Enrich prompt.
 *
 * Each entry is written as a compact `[label, path, intent, body]` tuple and
 * expanded below into the `{ label, input: { path, intent, body } }` shape.
 * Multi-line bodies are joined with '\n'; long single-paragraph bodies are
 * split into fragments joined with '' purely for line length.
 *
 * @type {Array<{ label: string, input: { path: string, intent: string, body: string } }>}
 */
const SAMPLES = [
  [
    'short-paragraph',
    'projects/example/inbox/quick-thought.md',
    'capture a quick observation about onboarding friction',
    [
      'Users on the trial plan are dropping off after the first invite step. ',
      'It seems related to the email confirmation delay. We should add a resend button.',
    ].join(''),
  ],
  [
    'long-technical',
    'projects/example/research/auth-rotation-2026-04.md',
    'research note on JWT rotation strategy',
    [
      '# JWT rotation review',
      '',
      '## Background',
      'Hosted Hub currently issues 24h JWTs (HUB_JWT_EXPIRY default). Rotation is manual.',
      '',
      '## Options considered',
      '1. Sliding 7d refresh token + 1h access token (industry standard).',
      '2. Stateless 24h JWT with mandatory client re-login (current).',
      '3. Server-side revocation list (cost: cache miss + DB hit).',
      '',
      '## Decision',
      'Move to option 1 in Q3 2026. Track in causal-chain `auth-rotation-2026`.',
      '',
      '## Risks',
      '- Refresh-token theft via XSS (mitigate with httpOnly cookies).',
      '- Logout requires server-side denylist (1h TTL bounded).',
    ].join('\n'),
  ],
  [
    'with-project-frontmatter-wording',
    'projects/born-free/outlines/landing-hero-2026-05.md',
    'outline for the May 2026 landing hero refresh',
    [
      'Project: Born Free. Audience: families and adventurers who want community-owned property access. ',
      'Hero CTA: claim Experience Key with code BORNFREE100. Tone: warm, partner-focused, never paid-influencer.',
    ].join(''),
  ],
  [
    'bulleted-only',
    'projects/example/playbooks/launch-checklist.md',
    'pre-launch operational checklist',
    [
      '- Confirm DEEPINFRA_API_KEY rotated in Netlify',
      '- Run Enrich validation harness (this script)',
      '- Smoke test create proposal in staging',
      '- Watch error rate dashboard for 1h',
      '- If ok, flip production env vars',
    ].join('\n'),
  ],
  [
    'code-blocks',
    'projects/knowtation/research/embedding-dim-table.md',
    'reference for embedding dimensions per provider',
    [
      'Common embedding dimensions:',
      '',
      '```',
      'openai/text-embedding-3-small 1536',
      'openai/text-embedding-3-large 3072',
      'voyage/voyage-4-lite 1024',
      'deepinfra/bge-large-en-v1.5 1024',
      'deepinfra/Qwen3-Embedding-8B 4096',
      '```',
    ].join('\n'),
  ],
  [
    'date-references',
    'projects/example/decisions/2026-04-15-llm-provider.md',
    'decision record for switching chat provider',
    [
      'On 2026-04-15 we decided to flip hosted Hub chat from OpenAI to DeepInfra. ',
      'Effective date: 2026-05-01 after staging validation. Updated: 2026-04-30. ',
      'Source: docs/NEXT-SESSION-HUB-LLM-COST-ROUTING.md.',
    ].join(''),
  ],
  [
    'named-entities',
    'projects/born-free/research/competitive-snapshot-2026-04.md',
    'competitive snapshot of community-owned travel platforms',
    [
      'Competitors: Kibbo (community RV access), Inspirato (luxury subscription), DAOhaus governance template. ',
      'Differentiators for Born Free: credits renew for life, DAO governance, member NFT, partner not promoter.',
    ].join(''),
  ],
  [
    'causal-chain',
    'projects/example/incidents/2026-04-22-hint-timeout.md',
    'post-mortem for review-hints timeout incident',
    [
      'Incident chain `hosted-hint-timeout-2026-q2` follows from earlier note ',
      'projects/example/incidents/2026-04-10-canister-cold-start.md. ',
      'Root cause: extra canister GET inside the 18s race. Fixed by merging client body into the hints job.',
    ].join(''),
  ],
  // Edge case: empty intent and a one-line body.
  ['edge-empty-intent', 'projects/example/inbox/random.md', '', 'one line note'],
  [
    'structured-table',
    'projects/example/research/provider-cost-2026-04.md',
    'cost comparison table',
    [
      '| Provider | Chat (per 1M tok) | Embed (per 1M tok) |',
      '|-----------|-------------------|---------------------|',
      '| OpenAI | 0.15 | 0.02 |',
      '| DeepInfra | 0.05 | 0.005 |',
      '| Voyage | n/a | 0.05 |',
    ].join('\n'),
  ],
].map(([label, notePath, intent, body]) => ({
  label,
  input: { path: notePath, intent, body },
}));
181
/**
 * Optional: build extra samples from real .md files (paths via --vault-sample).
 *
 * The whole file content becomes the sample body. The sample path is the file
 * path with everything up to the first `/vault/` segment collapsed to `vault/`.
 * The intent comes from the first non-empty `intent: <text>` line in the file,
 * falling back to 'imported vault sample'.
 *
 * Files that cannot be read are reported on stderr and skipped, so one bad
 * path does not abort the run.
 *
 * @param {string[]} filePaths - Paths of markdown files to load.
 * @returns {Array<{ label: string, input: { path: string, intent: string, body: string } }>}
 */
function loadVaultSamples(filePaths) {
  const out = [];
  for (const p of filePaths) {
    let body = '';
    try {
      body = fs.readFileSync(p, 'utf8');
    } catch (e) {
      console.error(`[skip] cannot read ${p}: ${e.message}`);
      continue;
    }
    // Only spaces/tabs may separate `intent:` from its value. `\s*` would also
    // swallow newlines, so an empty `intent:` line would capture the NEXT line
    // (e.g. `tags: ...`) as the intent. `\S` requires a real, non-blank value.
    const intentMatch = body.match(/^intent:[ \t]*(\S.*)$/m);
    out.push({
      label: `vault:${path.basename(p)}`,
      input: {
        path: p.replace(/^.*?\/vault\//, 'vault/'),
        intent: intentMatch ? intentMatch[1].trim() : 'imported vault sample',
        body,
      },
    });
  }
  return out;
}
208
/**
 * Parse the harness command-line flags.
 *
 * Recognized flags (each takes one value):
 *   --vault-sample <path>     may be repeated; paths are collected in order
 *   --pass-threshold <int>    minimum number of passing samples
 *
 * Unknown arguments and flags without a value are ignored.
 *
 * @param {string[]} argv - Full argv array; parsing starts at index 2.
 * @returns {{ vaultSamples: string[], passThreshold: number }} passThreshold
 *   defaults to 10 when the flag is absent or its value is not an integer.
 */
function parseArgs(argv) {
  const DEFAULT_PASS_THRESHOLD = 10;
  const out = { vaultSamples: [], passThreshold: DEFAULT_PASS_THRESHOLD };
  for (let i = 2; i < argv.length; i++) {
    const flag = argv[i];
    const value = argv[i + 1];
    if (flag === '--vault-sample' && value) {
      out.vaultSamples.push(value);
      i++; // the value was consumed together with the flag
    } else if (flag === '--pass-threshold' && value) {
      const parsed = Number.parseInt(value, 10);
      // Fall back only on NaN: `parsed || 10` would also discard an explicit 0.
      out.passThreshold = Number.isNaN(parsed) ? DEFAULT_PASS_THRESHOLD : parsed;
      i++;
    }
  }
  return out;
}
220
/**
 * List the keys of a suggested-frontmatter object in sorted order, for logging.
 *
 * @param {unknown} fm - Candidate frontmatter value.
 * @returns {string[]} Sorted own enumerable keys, or [] when `fm` is falsy or
 *   not of type 'object'.
 */
function summarizeFrontmatterKeys(fm) {
  const isObjectLike = Boolean(fm) && typeof fm === 'object';
  return isObjectLike ? Object.keys(fm).sort() : [];
}
225
/**
 * Check that every key of `fm` is a member of SUGGESTED_FRONTMATTER_KEYS.
 *
 * @param {unknown} fm - Candidate frontmatter value.
 * @returns {boolean} true when `fm` is falsy, not of type 'object', or has only
 *   allow-listed keys; false as soon as one key is not in the allow-list.
 */
function isSubsetOfAllowList(fm) {
  const keys = fm && typeof fm === 'object' ? Object.keys(fm) : [];
  return keys.every((key) => SUGGESTED_FRONTMATTER_KEYS.has(key));
}
233
/**
 * Run one sample through the LLM and check the response.
 *
 * Builds the prompt with `buildEnrichMessages`, calls `completeChat`, and
 * passes the raw reply to `validateAndNormalizeEnrichResult`. A sample passes
 * only when the result reports `parseOk`, its `suggested_frontmatter` keys are
 * all in the allow-list, and its summary is a non-blank string.
 *
 * Never rejects because of the LLM call: a thrown/rejected `completeChat` is
 * returned as `{ ok: false, reason }`.
 *
 * @param {{ label: string, input: { path: string, intent: string, body: string } }} sample
 * @returns {Promise<object>} Per-sample report: always `label`, `ok`, `ms`;
 *   `reason` on any failure; after a completed LLM call also `parseOk`,
 *   `allowListOk`, `summaryOk`, `summaryLen`, `labels`, `fmKeys`, `rawSnippet`.
 */
async function runOne(sample) {
  const { system, user } = buildEnrichMessages(sample.input);
  const t0 = Date.now();
  let raw;
  try {
    raw = await completeChat(
      { llm: {} },
      { system, user, maxTokens: 800 },
    );
  } catch (e) {
    return {
      label: sample.label,
      ok: false,
      reason: `LLM call failed: ${e.message || String(e)}`,
      ms: Date.now() - t0,
    };
  }
  const ms = Date.now() - t0;
  const result = validateAndNormalizeEnrichResult(raw);
  const allowListOk = isSubsetOfAllowList(result.suggested_frontmatter);
  const summaryOk = typeof result.summary === 'string' && result.summary.trim().length > 0;
  const ok = Boolean(result.parseOk && allowListOk && summaryOk);

  // Name the checks that failed so the caller's FAIL line is never blank.
  const failedChecks = [];
  if (!result.parseOk) failedChecks.push('response did not parse');
  if (!allowListOk) failedChecks.push('frontmatter keys outside allow-list');
  if (!summaryOk) failedChecks.push('summary missing or blank');

  return {
    label: sample.label,
    ok,
    ...(ok ? {} : { reason: failedChecks.join('; ') }),
    parseOk: result.parseOk,
    allowListOk,
    summaryOk,
    // Guard non-string values: a malformed reply must be reported as a failed
    // sample, not crash the harness with a TypeError.
    summaryLen: typeof result.summary === 'string' ? result.summary.length : 0,
    labels: result.suggested_labels,
    fmKeys: summarizeFrontmatterKeys(result.suggested_frontmatter),
    ms,
    rawSnippet: String(raw ?? '').slice(0, 200),
  };
}
269
/**
 * Harness entry point.
 *
 * Prints the provider/key configuration, runs every built-in and vault sample
 * sequentially through `runOne`, prints a per-sample line, and exits:
 *   0 - the number of passing samples met the pass threshold
 *   1 - too few samples passed
 *   2 - none of DEEPINFRA_API_KEY / OPENAI_API_KEY / ANTHROPIC_API_KEY is set
 *
 * Pass threshold: the value of `--pass-threshold` when that flag is given;
 * otherwise every sample (built-in plus loaded vault samples) must pass.
 *
 * @returns {Promise<void>} Does not normally resolve; the process exits first.
 */
async function main() {
  const args = parseArgs(process.argv);
  const vaultSamples = loadVaultSamples(args.vaultSamples);
  const samples = [...SAMPLES, ...vaultSamples];

  // The parser's fixed default would let failures slip through once vault
  // samples raise the total, so without an explicit flag require all samples.
  const thresholdGiven = process.argv.includes('--pass-threshold');
  const passThreshold = thresholdGiven ? args.passThreshold : samples.length;

  const provider = String(process.env.KNOWTATION_CHAT_PROVIDER || '').toLowerCase();
  const hasDeepinfra = Boolean(process.env.DEEPINFRA_API_KEY);
  const hasOpenai = Boolean(process.env.OPENAI_API_KEY);
  const hasAnthropic = Boolean(process.env.ANTHROPIC_API_KEY);

  console.log('--- DeepInfra Enrich validation harness ---');
  console.log(`KNOWTATION_CHAT_PROVIDER=${provider || '(unset)'}`);
  console.log(
    `Keys: deepinfra=${hasDeepinfra ? 'set' : '(unset)'} ` +
      `openai=${hasOpenai ? 'set' : '(unset)'} ` +
      `anthropic=${hasAnthropic ? 'set' : '(unset)'}`,
  );
  console.log(`DEEPINFRA_CHAT_MODEL=${process.env.DEEPINFRA_CHAT_MODEL || '(default Qwen/Qwen2.5-72B-Instruct)'}`);
  // Report the vault samples actually loaded; unreadable paths are skipped.
  console.log(`Samples: ${samples.length} (built-in: ${SAMPLES.length}, vault: ${vaultSamples.length})`);
  console.log(`Pass threshold: ${passThreshold}/${samples.length}`);
  console.log('');

  if (!hasDeepinfra && !hasOpenai && !hasAnthropic) {
    console.error(
      'No LLM key set. Configure DEEPINFRA_API_KEY (recommended) or OPENAI_API_KEY/ANTHROPIC_API_KEY for a control run.',
    );
    process.exit(2);
  }

  // Samples run one at a time so each result prints next to its label.
  const results = [];
  for (const s of samples) {
    process.stdout.write(`[${s.label}] ... `);
    const r = await runOne(s);
    results.push(r);
    if (r.ok) {
      console.log(
        `ok parseOk=${r.parseOk} summaryLen=${r.summaryLen} labels=${(r.labels || []).length} ` +
          `fmKeys=[${r.fmKeys.join(',')}] ${r.ms}ms`,
      );
    } else {
      console.log(`FAIL ${r.reason || ''}`);
      console.log(
        ` parseOk=${r.parseOk} allowListOk=${r.allowListOk} summaryOk=${r.summaryOk} ` +
          `fmKeys=[${(r.fmKeys || []).join(',')}] raw="${r.rawSnippet || ''}"`,
      );
    }
  }

  const passed = results.filter((r) => r.ok).length;
  const total = results.length;
  console.log('');
  console.log(`--- ${passed}/${total} samples passed ---`);

  if (passed < passThreshold) {
    console.error(
      `FAIL: only ${passed}/${total} samples passed (threshold ${passThreshold}). ` +
        'Do NOT flip production yet. Try a stronger model (Qwen/Qwen2.5-72B-Instruct) ' +
        'or tighten the system prompt before promoting.',
    );
    process.exit(1);
  }
  console.log('PASS: production flip is safe for the tested model + prompt.');
  process.exit(0);
}
333
// Start the harness. Any rejection that escapes main() is reported and mapped
// to exit code 2, keeping it distinct from a validation failure (exit code 1).
main().catch((e) => {
  // `e?.message` keeps this handler from throwing when the rejection value is
  // null or undefined.
  console.error(`harness crashed: ${e?.message || String(e)}`);
  process.exit(2);
});

File History 2 commits

sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ 1 day ago

sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 1 day ago