validate-deepinfra-enrich.mjs
337 lines 11.5 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago
1 #!/usr/bin/env node
2 /**
3 * DeepInfra Enrich validation harness.
4 *
5 * Purpose: before flipping the hosted Hub gateway from OpenAI to DeepInfra,
6 * verify that the chosen DeepInfra chat model returns JSON that
7 * `validateAndNormalizeEnrichResult` can parse for a representative spread
8 * of proposal-shaped inputs. Fail fast (exit 1) if any sample fails to parse,
9 * since downstream the canister will store empty `suggested_frontmatter` JSON
10 * and reviewers lose the metadata-suggestion benefit.
11 *
12 * Usage (Track A — flip explicit provider):
13 *
14 * DEEPINFRA_API_KEY=di-... \
15 * KNOWTATION_CHAT_PROVIDER=deepinfra \
16 * DEEPINFRA_CHAT_MODEL=Qwen/Qwen2.5-72B-Instruct \
17 * node scripts/validate-deepinfra-enrich.mjs
18 *
19 * Usage (Track B — control: re-run against current OpenAI to compare):
20 *
21 * OPENAI_API_KEY=sk-... \
22 * KNOWTATION_CHAT_PROVIDER=openai \
23 * node scripts/validate-deepinfra-enrich.mjs
24 *
25 * Exit code 0 = all samples passed; 1 = at least one failed to parse or
26 * produced fields outside the SPEC §2 allow-list; 2 = no LLM key is set or the harness crashed.
27 *
28 * Privacy: each sample body below is synthetic. No vault data is sent.
29 * If you want to stress-test against real proposal bodies, pass
30 * --vault-sample <path-to-md> [--vault-sample <path>...]
31 * (paths are read with fs.readFileSync — they never leave your machine until
32 * the LLM call goes out, which you have already approved by setting the API key).
33 */
34
35 import fs from 'node:fs';
36 import path from 'node:path';
37 import { fileURLToPath } from 'node:url';
38 import { completeChat } from '../lib/llm-complete.mjs';
39 import {
40 buildEnrichMessages,
41 validateAndNormalizeEnrichResult,
42 SUGGESTED_FRONTMATTER_KEYS,
43 } from '../lib/proposal-enrich-llm.mjs';
44
// ES modules have no built-in __dirname; derive it from this module's file URL.
const thisFile = fileURLToPath(import.meta.url);
const __dirname = path.dirname(thisFile);
46
/**
 * Built-in synthetic proposals sent through the Enrich prompt.
 *
 * Each row below is `[label, path, intent, body]`; the trailing `.map` expands the rows
 * into the `{ label, input }` records consumed by the rest of the harness.
 *
 * @type {Array<{ label: string, input: { path: string, intent: string, body: string } }>}
 */
const SAMPLES = [
  [
    'short-paragraph',
    'projects/example/inbox/quick-thought.md',
    'capture a quick observation about onboarding friction',
    'Users on the trial plan are dropping off after the first invite step. ' +
      'It seems related to the email confirmation delay. We should add a resend button.',
  ],
  [
    'long-technical',
    'projects/example/research/auth-rotation-2026-04.md',
    'research note on JWT rotation strategy',
    [
      '# JWT rotation review',
      '',
      '## Background',
      'Hosted Hub currently issues 24h JWTs (HUB_JWT_EXPIRY default). Rotation is manual.',
      '',
      '## Options considered',
      '1. Sliding 7d refresh token + 1h access token (industry standard).',
      '2. Stateless 24h JWT with mandatory client re-login (current).',
      '3. Server-side revocation list (cost: cache miss + DB hit).',
      '',
      '## Decision',
      'Move to option 1 in Q3 2026. Track in causal-chain `auth-rotation-2026`.',
      '',
      '## Risks',
      '- Refresh-token theft via XSS (mitigate with httpOnly cookies).',
      '- Logout requires server-side denylist (1h TTL bounded).',
    ].join('\n'),
  ],
  [
    'with-project-frontmatter-wording',
    'projects/born-free/outlines/landing-hero-2026-05.md',
    'outline for the May 2026 landing hero refresh',
    'Project: Born Free. Audience: families and adventurers who want community-owned property access. ' +
      'Hero CTA: claim Experience Key with code BORNFREE100. Tone: warm, partner-focused, never paid-influencer.',
  ],
  [
    'bulleted-only',
    'projects/example/playbooks/launch-checklist.md',
    'pre-launch operational checklist',
    [
      '- Confirm DEEPINFRA_API_KEY rotated in Netlify',
      '- Run Enrich validation harness (this script)',
      '- Smoke test create proposal in staging',
      '- Watch error rate dashboard for 1h',
      '- If ok, flip production env vars',
    ].join('\n'),
  ],
  [
    'code-blocks',
    'projects/knowtation/research/embedding-dim-table.md',
    'reference for embedding dimensions per provider',
    [
      'Common embedding dimensions:',
      '',
      '```',
      'openai/text-embedding-3-small 1536',
      'openai/text-embedding-3-large 3072',
      'voyage/voyage-4-lite 1024',
      'deepinfra/bge-large-en-v1.5 1024',
      'deepinfra/Qwen3-Embedding-8B 4096',
      '```',
    ].join('\n'),
  ],
  [
    'date-references',
    'projects/example/decisions/2026-04-15-llm-provider.md',
    'decision record for switching chat provider',
    'On 2026-04-15 we decided to flip hosted Hub chat from OpenAI to DeepInfra. ' +
      'Effective date: 2026-05-01 after staging validation. Updated: 2026-04-30. ' +
      'Source: docs/NEXT-SESSION-HUB-LLM-COST-ROUTING.md.',
  ],
  [
    'named-entities',
    'projects/born-free/research/competitive-snapshot-2026-04.md',
    'competitive snapshot of community-owned travel platforms',
    'Competitors: Kibbo (community RV access), Inspirato (luxury subscription), DAOhaus governance template. ' +
      'Differentiators for Born Free: credits renew for life, DAO governance, member NFT, partner not promoter.',
  ],
  [
    'causal-chain',
    'projects/example/incidents/2026-04-22-hint-timeout.md',
    'post-mortem for review-hints timeout incident',
    'Incident chain `hosted-hint-timeout-2026-q2` follows from earlier note ' +
      'projects/example/incidents/2026-04-10-canister-cold-start.md. ' +
      'Root cause: extra canister GET inside the 18s race. Fixed by merging client body into the hints job.',
  ],
  [
    'edge-empty-intent',
    'projects/example/inbox/random.md',
    '',
    'one line note',
  ],
  [
    'structured-table',
    'projects/example/research/provider-cost-2026-04.md',
    'cost comparison table',
    [
      '| Provider | Chat (per 1M tok) | Embed (per 1M tok) |',
      '|-----------|-------------------|---------------------|',
      '| OpenAI | 0.15 | 0.02 |',
      '| DeepInfra | 0.05 | 0.005 |',
      '| Voyage | n/a | 0.05 |',
    ].join('\n'),
  ],
].map(([label, notePath, intent, body]) => ({
  label,
  input: { path: notePath, intent, body },
}));
181
/**
 * Build extra samples from real Markdown files supplied via `--vault-sample`.
 *
 * For each readable file the whole content becomes `body`. `intent` is taken from the
 * first `intent: <text>` line when one exists with a non-blank value, otherwise a fixed
 * placeholder is used. `path` is the given path with everything up to and including the
 * first `/vault/` segment collapsed to `vault/`. Unreadable files are reported on stderr
 * and skipped, so one bad path does not abort the run.
 *
 * @param {string[]} filePaths - Paths to .md files on the local machine.
 * @returns {Array<{ label: string, input: { path: string, intent: string, body: string } }>}
 */
function loadVaultSamples(filePaths) {
  const samples = [];
  for (const filePath of filePaths) {
    let body;
    try {
      body = fs.readFileSync(filePath, 'utf8');
    } catch (e) {
      console.error(`[skip] cannot read ${filePath}: ${e.message}`);
      continue;
    }
    // `[ \t]*` (not `\s*`) keeps the match on the `intent:` line itself; `\s*` would hop
    // over the newline after an empty `intent:` and capture the following line instead.
    const intentMatch = body.match(/^intent:[ \t]*(.+)$/m);
    const intent = intentMatch?.[1].trim() || 'imported vault sample';
    // Normalise the platform separator so the `/vault/` anchor is also found in
    // Windows-style paths; this is a no-op where the separator is already `/`.
    const portablePath = filePath.split(path.sep).join('/');
    samples.push({
      label: `vault:${path.basename(filePath)}`,
      input: {
        path: portablePath.replace(/^.*?\/vault\//, 'vault/'),
        intent,
        body,
      },
    });
  }
  return samples;
}
208
/**
 * Parse the harness command line.
 *
 * Recognised flags (anything else is ignored):
 *   --vault-sample <path>   repeatable; each path is appended to `vaultSamples`
 *   --pass-threshold <n>    minimum number of passing samples (non-negative integer)
 *
 * A flag that appears last with no value after it is ignored.
 *
 * @param {string[]} argv - Full argv in `process.argv` layout; scanning starts at index 2.
 * @returns {{ vaultSamples: string[], passThreshold: number }} `passThreshold` is 10 when
 *   the flag is absent or its value is not a non-negative integer.
 */
function parseArgs(argv) {
  const out = { vaultSamples: [], passThreshold: 10 };
  for (let i = 2; i < argv.length; i++) {
    const flag = argv[i];
    const value = argv[i + 1];
    if (flag === '--vault-sample' && value) {
      out.vaultSamples.push(value);
      i++;
    } else if (flag === '--pass-threshold' && value) {
      i++;
      // Accept digits only: `parseInt(...) || 10` used to turn an explicit 0 into 10 and
      // quietly accept trailing garbage such as "5abc".
      if (/^\d+$/.test(value)) {
        out.passThreshold = Number.parseInt(value, 10);
      } else {
        console.error(
          `[warn] ignoring invalid --pass-threshold "${value}"; using ${out.passThreshold}`,
        );
      }
    }
  }
  return out;
}
220
/**
 * List the keys of a suggested-frontmatter value in sorted order, for compact log output.
 *
 * @param {unknown} fm - Candidate frontmatter value.
 * @returns {string[]} Sorted own enumerable keys, or `[]` when `fm` is falsy or not an object.
 */
function summarizeFrontmatterKeys(fm) {
  const isObjectLike = Boolean(fm) && typeof fm === 'object';
  if (!isObjectLike) {
    return [];
  }
  const keys = Object.keys(fm);
  keys.sort();
  return keys;
}
225
/**
 * Check that every key of a suggested-frontmatter value is in `SUGGESTED_FRONTMATTER_KEYS`.
 *
 * @param {unknown} fm - Candidate frontmatter value.
 * @returns {boolean} `true` when `fm` is falsy, not an object, or has only allowed keys;
 *   `false` as soon as one key is not in the allow-list.
 */
function isSubsetOfAllowList(fm) {
  const isObjectLike = Boolean(fm) && typeof fm === 'object';
  // Nothing to check counts as compliant.
  return !isObjectLike || Object.keys(fm).every((key) => SUGGESTED_FRONTMATTER_KEYS.has(key));
}
233
/**
 * Send one sample through the chat provider and grade the response.
 *
 * A sample passes when `validateAndNormalizeEnrichResult` reports `parseOk`, every
 * suggested frontmatter key is in the allow-list, and the summary is a non-blank string.
 * A failed LLM call or a failed check is reported as a result (with `reason`) rather than
 * thrown, so one bad sample cannot abort the whole run.
 *
 * @param {{ label: string, input: { path: string, intent: string, body: string } }} sample
 * @returns {Promise<object>} Always has `label`, `ok`, `ms` (LLM round-trip in milliseconds)
 *   and, on failure, `reason`. When the LLM call succeeded it also has `parseOk`,
 *   `allowListOk`, `summaryOk`, `summaryLen`, `labels`, `fmKeys` and `rawSnippet`
 *   (first 200 characters of the raw response).
 */
async function runOne(sample) {
  const { system, user } = buildEnrichMessages(sample.input);
  const startedAt = Date.now();
  let raw;
  try {
    raw = await completeChat({ llm: {} }, { system, user, maxTokens: 800 });
  } catch (e) {
    return {
      label: sample.label,
      ok: false,
      reason: `LLM call failed: ${e?.message || String(e)}`,
      ms: Date.now() - startedAt,
    };
  }
  const ms = Date.now() - startedAt;

  const result = validateAndNormalizeEnrichResult(raw);
  const allowListOk = isSubsetOfAllowList(result.suggested_frontmatter);
  // Read the summary once through a type guard: a non-string summary must fail the sample,
  // not crash the harness on `.length`.
  const summary = typeof result.summary === 'string' ? result.summary : '';
  const summaryOk = summary.trim().length > 0;
  const parseOk = result.parseOk;
  const ok = Boolean(parseOk && allowListOk && summaryOk);

  const failedChecks = [];
  if (!parseOk) failedChecks.push('parseOk');
  if (!allowListOk) failedChecks.push('allowListOk');
  if (!summaryOk) failedChecks.push('summaryOk');

  const graded = {
    label: sample.label,
    ok,
    parseOk,
    allowListOk,
    summaryOk,
    summaryLen: summary.length,
    labels: Array.isArray(result.suggested_labels) ? result.suggested_labels : [],
    fmKeys: summarizeFrontmatterKeys(result.suggested_frontmatter),
    ms,
    // Coerce defensively: a provider returning null/undefined should still yield a snippet.
    rawSnippet: (typeof raw === 'string' ? raw : String(raw ?? '')).slice(0, 200),
  };
  if (!ok) {
    graded.reason = `failed checks: ${failedChecks.join(', ')}`;
  }
  return graded;
}
269
/**
 * Harness entry point: run every sample through the configured chat provider, print one
 * line per sample plus a final tally, and exit with a status code.
 *
 * Exit codes set here: 0 = enough samples passed, 1 = fewer than the threshold passed,
 * 2 = no LLM API key is configured.
 *
 * The pass threshold is every loaded sample unless `--pass-threshold` is given explicitly,
 * so adding `--vault-sample` files can never let a failing sample slip through by default.
 *
 * @returns {Promise<void>} Normal completion always ends in `process.exit`.
 */
async function main() {
  const args = parseArgs(process.argv);
  const vaultSamples = loadVaultSamples(args.vaultSamples);
  const samples = [...SAMPLES, ...vaultSamples];

  // parseArgs reports 10 both for "flag absent" and for an explicit 10, so look at argv to
  // tell the two apart; without the flag every sample (built-in + vault) must pass.
  const thresholdGiven = process.argv.includes('--pass-threshold');
  const passThreshold = thresholdGiven ? args.passThreshold : samples.length;

  const provider = String(process.env.KNOWTATION_CHAT_PROVIDER || '').toLowerCase();
  const hasDeepinfra = Boolean(process.env.DEEPINFRA_API_KEY);
  const hasOpenai = Boolean(process.env.OPENAI_API_KEY);
  const hasAnthropic = Boolean(process.env.ANTHROPIC_API_KEY);

  console.log('--- DeepInfra Enrich validation harness ---');
  console.log(`KNOWTATION_CHAT_PROVIDER=${provider || '(unset)'}`);
  console.log(
    `Keys: deepinfra=${hasDeepinfra ? 'set' : '(unset)'} ` +
      `openai=${hasOpenai ? 'set' : '(unset)'} ` +
      `anthropic=${hasAnthropic ? 'set' : '(unset)'}`,
  );
  console.log(`DEEPINFRA_CHAT_MODEL=${process.env.DEEPINFRA_CHAT_MODEL || '(default Qwen/Qwen2.5-72B-Instruct)'}`);
  // Count the vault samples actually loaded; unreadable files were skipped by the loader.
  console.log(`Samples: ${samples.length} (built-in: ${SAMPLES.length}, vault: ${vaultSamples.length})`);
  console.log(`Pass threshold: ${passThreshold}/${samples.length}`);
  console.log('');

  if (!hasDeepinfra && !hasOpenai && !hasAnthropic) {
    console.error(
      'No LLM key set. Configure DEEPINFRA_API_KEY (recommended) or OPENAI_API_KEY/ANTHROPIC_API_KEY for a control run.',
    );
    process.exit(2);
  }

  // Samples run one at a time so each progress line is completed before the next starts.
  const results = [];
  for (const s of samples) {
    process.stdout.write(`[${s.label}] ... `);
    const r = await runOne(s);
    results.push(r);
    if (r.ok) {
      console.log(
        `ok parseOk=${r.parseOk} summaryLen=${r.summaryLen} labels=${(r.labels || []).length} ` +
          `fmKeys=[${(r.fmKeys || []).join(',')}] ${r.ms}ms`,
      );
    } else {
      console.log(`FAIL ${r.reason || ''}`);
      // The per-check details only exist when the LLM call itself succeeded.
      if ('parseOk' in r) {
        console.log(
          ` parseOk=${r.parseOk} allowListOk=${r.allowListOk} summaryOk=${r.summaryOk} ` +
            `fmKeys=[${(r.fmKeys || []).join(',')}] raw="${r.rawSnippet || ''}"`,
        );
      }
    }
  }

  const passed = results.filter((r) => r.ok).length;
  const total = results.length;
  console.log('');
  console.log(`--- ${passed}/${total} samples passed ---`);

  if (passed < passThreshold) {
    console.error(
      `FAIL: only ${passed}/${total} samples passed (threshold ${passThreshold}). ` +
        'Do NOT flip production yet. Try a stronger model (Qwen/Qwen2.5-72B-Instruct) ' +
        'or tighten the system prompt before promoting.',
    );
    process.exit(1);
  }
  console.log('PASS: production flip is safe for the tested model + prompt.');
  process.exit(0);
}
333
// Last-resort handler: anything main() lets escape is reported and mapped to exit code 2.
main().catch((err) => {
  const detail = err.message || String(err);
  console.error(`harness crashed: ${detail}`);
  process.exit(2);
});
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 1 day ago