bridge-index-cache-contract.test.mjs
136 lines 5.5 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago
1 /**
2 * Contract tests for the `feat/bridge-embed-hash-cache` wiring inside
3 * `hub/bridge/server.mjs POST /api/v1/index`. The handler is too tightly coupled
4 * to Netlify Blobs / canister export / live embedding to boot in a Node test, so
5 * we lock in the static wiring that PR 1 introduces with source-string asserts:
6 *
7 * - imports the right helpers (computeChunkContentHashTagged, partitionChunksForReindex,
8 * runWithConcurrency, parseEmbedConcurrency, parseEmbedBatchSize);
9 * - calls store.getChunkHashes(vaultId) for cache lookup;
10 * - upsert payload includes content_hash;
11 * - response includes chunksSkippedCached + chunksEmbedded (so the UI can show savings);
12 * - timer logs cache_lookup step (post-mortem signal);
13 * - BATCH_EMBED + EMBED_CONCURRENCY are env-tunable, not hard-coded 10.
14 *
15 * If any of these regress in a future refactor, post-PR1 re-indexes silently fall
16 * back to "embed everything" — the exact failure mode this PR exists to eliminate.
17 */
18
19 import { readFileSync } from 'node:fs';
20 import { fileURLToPath } from 'node:url';
21 import { dirname, join } from 'node:path';
22 import test from 'node:test';
23 import assert from 'node:assert/strict';
24
// Directory that holds this test file, derived from the module URL.
const thisTestDir = dirname(fileURLToPath(import.meta.url));
// `root` is the parent of that directory; every path below is resolved from it.
const root = join(thisTestDir, '..');
// Full text of the bridge server module, read once and shared by all tests,
// which assert on it with regular expressions instead of booting the handler.
const bridgeServerPath = join(root, 'hub/bridge/server.mjs');
const bridgeJs = readFileSync(bridgeServerPath, 'utf8');
27
// Verifies that the bridge source imports from the three helper modules and
// that the helper identifiers appear somewhere in the source text.
test('bridge imports content-hash + parallel-pool + partition helpers', () => {
  // Each entry pairs an import-path pattern with its failure message.
  const requiredImports = [
    [
      /from\s+['"]\.\.\/\.\.\/lib\/chunk-content-hash\.mjs['"]/,
      'must import from lib/chunk-content-hash.mjs',
    ],
    [
      /from\s+['"]\.\.\/\.\.\/lib\/parallel-embed-pool\.mjs['"]/,
      'must import from lib/parallel-embed-pool.mjs',
    ],
    [
      /from\s+['"]\.\.\/\.\.\/lib\/index-partition\.mjs['"]/,
      'must import from lib/index-partition.mjs',
    ],
  ];
  for (const [importPattern, failureMessage] of requiredImports) {
    assert.match(bridgeJs, importPattern, failureMessage);
  }

  // These checks only require the identifier to occur as a whole word
  // anywhere in the file; they do not tie it to a specific import statement.
  const requiredIdentifiers = [
    /\bcomputeChunkContentHashTagged\b/,
    /\brunWithConcurrency\b/,
    /\bpartitionChunksForReindex\b/,
  ];
  for (const identifierPattern of requiredIdentifiers) {
    assert.match(bridgeJs, identifierPattern);
  }
});
48
// Verifies that embed batch size and concurrency are parsed from environment
// variables, and that the legacy hard-coded `BATCH_EMBED = 10` declaration has
// not been reintroduced.
test('BATCH_EMBED + EMBED_CONCURRENCY are derived from env (not hard-coded 10)', () => {
  assert.match(
    bridgeJs,
    /parseEmbedBatchSize\(\s*process\.env\.INDEXER_EMBED_BATCH_SIZE\s*\)/,
    'INDEXER_EMBED_BATCH_SIZE must drive the batch size',
  );
  assert.match(
    bridgeJs,
    /parseEmbedConcurrency\(\s*process\.env\.INDEXER_EMBED_CONCURRENCY\s*\)/,
    'INDEXER_EMBED_CONCURRENCY must drive parallelism',
  );
  // The legacy serial-loop constant `BATCH_EMBED = 10` is gone.
  // The guard accepts any declaration keyword (const/let/var) and treats the
  // trailing semicolon as optional, so a regression written as
  // `let BATCH_EMBED = 10` or `const BATCH_EMBED = 10` (no semicolon) is caught
  // too. `10` must be followed by a semicolon or end of line, so values such
  // as `100` or expressions such as `10 * n` are not flagged.
  assert.doesNotMatch(
    bridgeJs,
    /^\s*(?:const|let|var)\s+BATCH_EMBED\s*=\s*10[ \t]*(?:;|$)/m,
    'BATCH_EMBED = 10 is the pre-PR sequential-loop default; must not return',
  );
});
67
// Verifies that the bridge source contains a `store.getChunkHashes(vaultId)`
// call, which is the cache lookup the index handler depends on.
test('handler queries store.getChunkHashes(vaultId) for cache lookup', () => {
  const cacheLookupCall = /store\.getChunkHashes\s*\(\s*vaultId\s*\)/;
  const failureMessage =
    'index handler must call store.getChunkHashes(vaultId) to populate the cache';
  assert.match(bridgeJs, cacheLookupCall, failureMessage);
});
75
// Verifies that `content_hash: item.contentHash` appears within 1500
// characters after `points = slice.map(` in the bridge source, i.e. inside a
// window around the upsert payload object literal.
test('upsert payload includes content_hash so future runs can hit the cache', () => {
  const upsertPayloadPattern =
    /points\s*=\s*slice\.map\([\s\S]{0,1500}?content_hash:\s*item\.contentHash/;
  // `exec` yields a match array when found and null otherwise.
  const upsertWindow = upsertPayloadPattern.exec(bridgeJs);
  assert.ok(
    upsertWindow,
    'upsert payload must include content_hash: item.contentHash for the cache to populate',
  );
});
86
// Verifies that the bridge source maps `chunksSkippedCached` and
// `chunksEmbedded` to their expected values and records a `cache_lookup`
// timer step.
test('response + timer surface chunksSkippedCached and chunksEmbedded', () => {
  // Each entry pairs a source pattern with its failure message; the entries
  // are checked in order.
  const expectations = [
    [
      /chunksSkippedCached:\s*chunks_skipped_cached/,
      'response JSON must expose chunksSkippedCached',
    ],
    [
      /chunksEmbedded:\s*toEmbed\.length/,
      'response JSON must expose chunksEmbedded',
    ],
    [
      /timer\.step\(['"]cache_lookup['"]/,
      'timer must emit cache_lookup step (post-mortem signal for cache hit rate)',
    ],
  ];
  for (const [sourcePattern, failureMessage] of expectations) {
    assert.match(bridgeJs, sourcePattern, failureMessage);
  }
});
104
// Verifies that the bridge source contains `runWithConcurrency(` immediately
// followed (modulo whitespace) by `embedBatches.map`.
test('parallel embed loop uses runWithConcurrency, not a serial for-of/await', () => {
  const parallelEmbedCall = /runWithConcurrency\(\s*embedBatches\.map/;
  const failureMessage =
    'embed step must call runWithConcurrency with embedBatches.map of thunks';
  assert.match(bridgeJs, parallelEmbedCall, failureMessage);
});
112
// Verifies that `allow_dimension_migration: true` appears within 1500
// characters after `function getBridgeStoreConfig` in the bridge source.
test('getBridgeStoreConfig sets allow_dimension_migration: true so a provider switch unblocks the indexer', () => {
  // Why this matters: after an OpenAI(1536) → DeepInfra(1024) switch, a 1536-dim table remains
  // on disk, and without this flag every later `POST /api/v1/index` aborts with
  // "Vector store dimension mismatch" before the new content-hash migration logic can run.
  // The bridge owns the data lifecycle (downloads from blob → re-indexes → uploads to blob),
  // so an automatic drop+recreate is the only correct resolution. The CLI keeps the throw —
  // see `lib/vector-store-sqlite.mjs:ensureCollection`.
  const migrationFlagPattern =
    /function getBridgeStoreConfig[\s\S]{0,1500}?allow_dimension_migration:\s*true/;
  const failureMessage = 'getBridgeStoreConfig must set allow_dimension_migration: true';
  assert.match(bridgeJs, migrationFlagPattern, failureMessage);
});
125
// Verifies that the bridge source calls
// `computeChunkContentHashTagged(chunk, embeddingConfigForHash)` with both arguments.
test('chunk content-hash is bound to the active embedding provider+model', () => {
  // The embedding config MUST be passed at the hash call site. Omitting it makes
  // `computeChunkContentHashTagged` throw (a loud caller bug), but this test pins the
  // intentional wiring so that a later refactor cannot quietly drop the second argument and
  // bring back the same-dimension model-swap silent corruption (BGE-large 1024 → BGE-m3 1024 etc.).
  const hashCallPattern =
    /computeChunkContentHashTagged\(\s*chunk\s*,\s*embeddingConfigForHash\s*\)/;
  const failureMessage =
    'bridge must call computeChunkContentHashTagged(chunk, embeddingConfigForHash) so a model swap invalidates the cache';
  assert.match(bridgeJs, hashCallPattern, failureMessage);
});
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 1 day ago