bridge-index-cache-contract.test.mjs
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠ breaking
1 day ago
/**
 * Contract tests for the `feat/bridge-embed-hash-cache` wiring inside
 * `hub/bridge/server.mjs POST /api/v1/index`. The handler is too tightly coupled
 * to Netlify Blobs / canister export / live embedding to boot in a Node test, so
 * we lock in the static wiring that PR 1 introduces with source-string asserts:
 *
 * - imports the right helpers (computeChunkContentHashTagged, partitionChunksForReindex,
 *   runWithConcurrency, parseEmbedConcurrency, parseEmbedBatchSize);
 * - calls store.getChunkHashes(vaultId) for cache lookup;
 * - upsert payload includes content_hash;
 * - response includes chunksSkippedCached + chunksEmbedded (so the UI can show savings);
 * - timer logs cache_lookup step (post-mortem signal);
 * - BATCH_EMBED + EMBED_CONCURRENCY are env-tunable, not hard-coded 10.
 *
 * Also locked in by the tests below:
 *
 * - the embed step goes through runWithConcurrency(embedBatches.map(...)), not a serial loop;
 * - getBridgeStoreConfig sets allow_dimension_migration: true;
 * - computeChunkContentHashTagged is called with (chunk, embeddingConfigForHash).
 *
 * If any of these regress in a future refactor, post-PR 1 re-indexes silently fall
 * back to "embed everything" — the exact failure mode this PR exists to eliminate.
 */
| 18 | |
| 19 | import { readFileSync } from 'node:fs'; |
| 20 | import { fileURLToPath } from 'node:url'; |
| 21 | import { dirname, join } from 'node:path'; |
| 22 | import test from 'node:test'; |
| 23 | import assert from 'node:assert/strict'; |
| 24 | |
// Directory containing this test file; the repository root is one level above it.
const testDir = dirname(fileURLToPath(import.meta.url));
const root = join(testDir, '..');

// Full text of the bridge server source; every test below asserts against this string.
const bridgeSourcePath = join(root, 'hub/bridge/server.mjs');
const bridgeJs = readFileSync(bridgeSourcePath, { encoding: 'utf8' });
| 27 | |
// Static check: the bridge source must import from each helper module and
// mention each helper identifier somewhere in its text.
test('bridge imports content-hash + parallel-pool + partition helpers', () => {
  // [pattern, failure message] pairs, checked in order.
  const requiredImports = [
    [
      /from\s+['"]\.\.\/\.\.\/lib\/chunk-content-hash\.mjs['"]/,
      'must import from lib/chunk-content-hash.mjs',
    ],
    [
      /from\s+['"]\.\.\/\.\.\/lib\/parallel-embed-pool\.mjs['"]/,
      'must import from lib/parallel-embed-pool.mjs',
    ],
    [
      /from\s+['"]\.\.\/\.\.\/lib\/index-partition\.mjs['"]/,
      'must import from lib/index-partition.mjs',
    ],
  ];
  for (const [importPattern, failureMessage] of requiredImports) {
    assert.match(bridgeJs, importPattern, failureMessage);
  }

  // Helper names only need to appear as whole words; no custom message is attached.
  const requiredIdentifiers = [
    /\bcomputeChunkContentHashTagged\b/,
    /\brunWithConcurrency\b/,
    /\bpartitionChunksForReindex\b/,
  ];
  for (const identifierPattern of requiredIdentifiers) {
    assert.match(bridgeJs, identifierPattern);
  }
});
| 48 | |
// Static check: batch size and concurrency must be parsed from environment
// variables, and the legacy hard-coded BATCH_EMBED = 10 must not reappear.
test('BATCH_EMBED + EMBED_CONCURRENCY are derived from env (not hard-coded 10)', () => {
  assert.match(
    bridgeJs,
    /parseEmbedBatchSize\(\s*process\.env\.INDEXER_EMBED_BATCH_SIZE\s*\)/,
    'INDEXER_EMBED_BATCH_SIZE must drive the batch size',
  );
  assert.match(
    bridgeJs,
    /parseEmbedConcurrency\(\s*process\.env\.INDEXER_EMBED_CONCURRENCY\s*\)/,
    'INDEXER_EMBED_CONCURRENCY must drive parallelism',
  );
  // The legacy serial-loop constant `const BATCH_EMBED = 10` is gone.
  // The guard accepts any declaration keyword and an optional trailing semicolon,
  // so `let BATCH_EMBED = 10` or a semicolon-less line cannot slip past it.
  // `10` must be followed by `;` or end of line, so values like `100` do not match.
  assert.doesNotMatch(
    bridgeJs,
    /^\s*(?:const|let|var)\s+BATCH_EMBED\s*=\s*10\s*(?:;|$)/m,
    'BATCH_EMBED = 10 is the pre-PR sequential-loop default; must not return',
  );
});
| 67 | |
// Static check: the bridge source must contain a store.getChunkHashes(vaultId) call.
test('handler queries store.getChunkHashes(vaultId) for cache lookup', () => {
  const cacheLookupCall = /store\.getChunkHashes\s*\(\s*vaultId\s*\)/;
  const failureMessage =
    'index handler must call store.getChunkHashes(vaultId) to populate the cache';
  assert.match(bridgeJs, cacheLookupCall, failureMessage);
});
| 75 | |
// Static check: the upsert payload built from `slice.map(` must carry
// `content_hash: item.contentHash`.
test('upsert payload includes content_hash so future runs can hit the cache', () => {
  // Look in a window around the upsert payload object literal: the lazy
  // {0,1500} span bounds how far after `points = slice.map(` the key may appear.
  // assert.match is used (rather than String.match + assert.ok) for consistency
  // with the other tests in this file.
  assert.match(
    bridgeJs,
    /points\s*=\s*slice\.map\([\s\S]{0,1500}?content_hash:\s*item\.contentHash/,
    'upsert payload must include content_hash: item.contentHash for the cache to populate',
  );
});
| 86 | |
// Static check: the response fields and the cache_lookup timer step must all
// be present in the bridge source.
test('response + timer surface chunksSkippedCached and chunksEmbedded', () => {
  // Checked in order; each entry pairs a source pattern with its failure message.
  const expectations = [
    {
      pattern: /chunksSkippedCached:\s*chunks_skipped_cached/,
      message: 'response JSON must expose chunksSkippedCached',
    },
    {
      pattern: /chunksEmbedded:\s*toEmbed\.length/,
      message: 'response JSON must expose chunksEmbedded',
    },
    {
      pattern: /timer\.step\(['"]cache_lookup['"]/,
      message: 'timer must emit cache_lookup step (post-mortem signal for cache hit rate)',
    },
  ];
  for (const { pattern, message } of expectations) {
    assert.match(bridgeJs, pattern, message);
  }
});
| 104 | |
// Static check: the bridge source must contain `runWithConcurrency(embedBatches.map`.
test('parallel embed loop uses runWithConcurrency, not a serial for-of/await', () => {
  const parallelEmbedCall = /runWithConcurrency\(\s*embedBatches\.map/;
  const failureMessage =
    'embed step must call runWithConcurrency with embedBatches.map of thunks';
  assert.match(bridgeJs, parallelEmbedCall, failureMessage);
});
| 112 | |
// Static check: `allow_dimension_migration: true` must appear within 1500
// characters after `function getBridgeStoreConfig`.
test('getBridgeStoreConfig sets allow_dimension_migration: true so a provider switch unblocks the indexer', () => {
  // Rationale: after an OpenAI(1536) → DeepInfra(1024) switch, a 1536-dim table stays on disk
  // and every later `POST /api/v1/index` aborts with "Vector store dimension mismatch" before
  // the content-hash migration logic gets a chance to run. Because the bridge owns the data
  // lifecycle (downloads from blob → re-indexes → uploads to blob), an automatic drop+recreate
  // is the only correct resolution. The CLI keeps the throw — see
  // `lib/vector-store-sqlite.mjs:ensureCollection`.
  const migrationFlagInConfig =
    /function getBridgeStoreConfig[\s\S]{0,1500}?allow_dimension_migration:\s*true/;
  assert.match(
    bridgeJs,
    migrationFlagInConfig,
    'getBridgeStoreConfig must set allow_dimension_migration: true',
  );
});
| 125 | |
// Static check: the hash helper must be called as
// computeChunkContentHashTagged(chunk, embeddingConfigForHash).
test('chunk content-hash is bound to the active embedding provider+model', () => {
  // The embedding config has to be passed at the hash call site. Omitting it makes
  // `computeChunkContentHashTagged` throw (a loud caller bug), but this assertion pins the
  // intended wiring so a later refactor cannot quietly drop the second argument and bring back
  // the same-dimension model-swap silent corruption (BGE-large 1024 → BGE-m3 1024 etc.).
  const hashCallWithConfig =
    /computeChunkContentHashTagged\(\s*chunk\s*,\s*embeddingConfigForHash\s*\)/;
  const failureMessage =
    'bridge must call computeChunkContentHashTagged(chunk, embeddingConfigForHash) so a model swap invalidates the cache';
  assert.match(bridgeJs, hashCallWithConfig, failureMessage);
});
File History
2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠
1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6
docs: accept Calendar Events v0 spec with Phase 0 security …
Human
1 day ago