chunk-content-hash.test.mjs
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠ breaking
2 days ago
| 1 | /** |
| 2 | * Tests for `lib/chunk-content-hash.mjs`. The bridge will compare these hashes |
| 3 | * against the `+content_hash` column in sqlite-vec to decide whether to skip |
| 4 | * embedding a chunk on re-index, so the contract that matters here is: |
| 5 | * 1. Same logical input → same digest, deterministically, across processes. |
| 6 | * 2. Different text OR different search-relevant metadata → different digest. |
| 7 | * 3. Tag/entity array order is not significant (chunks built in different |
| 8 | * orders must hash identically). |
| 9 | * 4. Missing/null/undefined for optional fields hash identically (so a chunk |
| 10 | * with `tags: undefined` matches one with `tags: []`). |
| 11 | * 5. Tagged variant carries the prefix `v1:<provider>:<model>:` so future algo bumps |
| 12 | * and embedding provider/model swaps can be detected on read without touching every callsite. |
| 13 | * 6. Bad input throws (so a bridge bug surfaces loudly, not silently). |
| 14 | */ |
| 15 | |
| 16 | import { describe, it } from 'node:test'; |
| 17 | import assert from 'node:assert/strict'; |
| 18 | import { |
| 19 | computeChunkContentHash, |
| 20 | computeChunkContentHashTagged, |
| 21 | CHUNK_CONTENT_HASH_VERSION, |
| 22 | } from '../lib/chunk-content-hash.mjs'; |
| 23 | |
/**
 * Contract tests for the untagged digest, `computeChunkContentHash(chunk)`.
 *
 * Covered here: determinism and output shape, sensitivity to text, path and
 * every optional metadata field, order-independence of the tag/entity arrays,
 * equivalence of missing/null/undefined/empty optional fields, and loud
 * failure on malformed input.
 */
describe('computeChunkContentHash', () => {
  it('is deterministic and returns 32 lowercase hex chars (128 bits)', () => {
    const chunk = { text: 'hello world', path: 'notes/a.md' };
    const first = computeChunkContentHash(chunk);
    const second = computeChunkContentHash(chunk);
    assert.equal(first, second);
    assert.match(first, /^[0-9a-f]{32}$/);
  });

  it('changes when text changes', () => {
    const a = computeChunkContentHash({ text: 'one', path: 'a.md' });
    const b = computeChunkContentHash({ text: 'two', path: 'a.md' });
    assert.notEqual(a, b);
  });

  it('changes when path changes (since path is part of the search-relevant payload)', () => {
    const a = computeChunkContentHash({ text: 'same', path: 'a.md' });
    const b = computeChunkContentHash({ text: 'same', path: 'b.md' });
    assert.notEqual(a, b);
  });

  // One subtest per optional metadata field. A single combined test stops at
  // the first failing assertion and hides which of the remaining fields are
  // also broken; separate subtests name the offending field in the report.
  const metadataEdits = [
    ['project', 'p1'],
    ['date', '2026-05-01'],
    ['tags', ['x']],
    ['causal_chain_id', 'c1'],
    ['episode_id', 'ep1'],
    ['entity', ['e1']],
  ];
  for (const [field, value] of metadataEdits) {
    it(`changes when ${field} is set (so metadata edits invalidate the cache)`, () => {
      const base = { text: 'x', path: 'a.md' };
      assert.notEqual(
        computeChunkContentHash({ ...base, [field]: value }),
        computeChunkContentHash(base),
        `setting ${field} must change the digest`,
      );
    });
  }

  it('treats tag/entity arrays as sets (order-independent)', () => {
    const a = computeChunkContentHash({
      text: 'x',
      path: 'a.md',
      tags: ['b', 'a'],
      entity: ['z', 'y'],
    });
    const b = computeChunkContentHash({
      text: 'x',
      path: 'a.md',
      tags: ['a', 'b'],
      entity: ['y', 'z'],
    });
    assert.equal(a, b);
  });

  it('treats undefined/null/missing equivalently for optional fields', () => {
    const minimal = { text: 'x', path: 'a.md' };
    const explicitNulls = {
      text: 'x',
      path: 'a.md',
      project: null,
      tags: null,
      date: null,
      causal_chain_id: null,
      entity: null,
      episode_id: null,
    };
    const explicitEmpty = {
      text: 'x',
      path: 'a.md',
      project: undefined,
      tags: [],
      date: undefined,
      causal_chain_id: undefined,
      entity: [],
      episode_id: undefined,
    };
    const h1 = computeChunkContentHash(minimal);
    const h2 = computeChunkContentHash(explicitNulls);
    const h3 = computeChunkContentHash(explicitEmpty);
    assert.equal(h1, h2);
    assert.equal(h1, h3);
  });

  it('throws on missing chunk', () => {
    assert.throws(() => computeChunkContentHash(null), /chunk is required/);
    assert.throws(() => computeChunkContentHash(undefined), /chunk is required/);
  });

  it('throws on missing/wrong-type text or path (bridge bug must surface loudly)', () => {
    assert.throws(
      () => computeChunkContentHash({ path: 'a.md' }),
      /chunk\.text must be a string/,
    );
    assert.throws(
      () => computeChunkContentHash({ text: 'x' }),
      /chunk\.path must be a string/,
    );
    assert.throws(
      () => computeChunkContentHash({ text: 123, path: 'a.md' }),
      /chunk\.text must be a string/,
    );
    // The title promises wrong-type coverage for path as well as text.
    // NOTE(review): assumes the library type-checks path the same way it
    // type-checks text — confirm against lib/chunk-content-hash.mjs.
    assert.throws(
      () => computeChunkContentHash({ text: 'x', path: 123 }),
      /chunk\.path must be a string/,
    );
  });
});
| 127 | |
/**
 * Contract tests for the tagged digest,
 * `computeChunkContentHashTagged(chunk, embeddingConfig)`.
 *
 * Covered here: the "v<N>:<provider>:<model>:<32-hex>" layout, invalidation
 * when provider or model changes, normalization of provider and model
 * strings, and loud failure when the embedding config is missing or
 * incomplete.
 */
describe('computeChunkContentHashTagged', () => {
  const cfg = { provider: 'deepinfra', model: 'BAAI/bge-large-en-v1.5' };

  it('format is "v<N>:<provider>:<model>:<32-hex>" with provider lowercased', () => {
    const tagged = computeChunkContentHashTagged({ text: 'hello', path: 'a.md' }, cfg);
    const parts = tagged.split(':');
    // Pin the segment count: checking indexes 0..3 alone would let a value
    // with extra trailing segments slip through unnoticed.
    assert.equal(parts.length, 4, `expected exactly 4 ':'-separated segments, got "${tagged}"`);
    assert.equal(parts[0], 'v1');
    assert.equal(parts[1], 'deepinfra');
    assert.equal(parts[2], 'BAAI/bge-large-en-v1.5');
    assert.match(parts[3], /^[0-9a-f]{32}$/);
  });

  it('current version is v1', () => {
    assert.equal(CHUNK_CONTENT_HASH_VERSION, 'v1');
  });

  it('two equivalent chunks under same provider+model produce equal tagged hashes', () => {
    const a = computeChunkContentHashTagged({ text: 'x', path: 'a.md', tags: ['b', 'a'] }, cfg);
    const b = computeChunkContentHashTagged({ text: 'x', path: 'a.md', tags: ['a', 'b'] }, cfg);
    assert.equal(a, b);
  });

  it('changing provider invalidates the cache (different prefix)', () => {
    const chunk = { text: 'same', path: 'a.md' };
    const a = computeChunkContentHashTagged(chunk, { provider: 'openai', model: 'text-embedding-3-small' });
    const b = computeChunkContentHashTagged(chunk, { provider: 'deepinfra', model: 'BAAI/bge-large-en-v1.5' });
    assert.notEqual(a, b, 'same chunk under different providers must hash differently');
  });

  it('changing model (same provider, same dimension) invalidates the cache', () => {
    // The whole point of putting model in the prefix: BGE-large (1024) → BGE-m3 (1024)
    // is a same-dimension swap that the dimension check cannot catch. Without model in the
    // hash, every chunk would be a cache hit and we would silently keep stale vectors.
    const chunk = { text: 'same', path: 'a.md' };
    const a = computeChunkContentHashTagged(chunk, { provider: 'deepinfra', model: 'BAAI/bge-large-en-v1.5' });
    const b = computeChunkContentHashTagged(chunk, { provider: 'deepinfra', model: 'BAAI/bge-m3' });
    assert.notEqual(a, b, 'same chunk under different models must hash differently');
  });

  it('provider is lowercased with non-alphanumerics stripped (deterministic across casing/typos)', () => {
    const chunk = { text: 'x', path: 'a.md' };
    const mixedCase = computeChunkContentHashTagged(chunk, { provider: 'DeepInfra', model: 'm' });
    const canonical = computeChunkContentHashTagged(chunk, { provider: 'deepinfra', model: 'm' });
    const trailingSpace = computeChunkContentHashTagged(chunk, { provider: 'deepinfra ', model: 'm' });
    assert.equal(mixedCase, canonical);
    assert.equal(mixedCase, trailingSpace);
  });

  it('model preserves slashes (e.g. BAAI/bge-large) but trims surrounding whitespace', () => {
    const chunk = { text: 'x', path: 'a.md' };
    const tagged = computeChunkContentHashTagged(chunk, {
      provider: 'deepinfra',
      model: ' BAAI/bge-large-en-v1.5 ',
    });
    assert.match(tagged, /^v1:deepinfra:BAAI\/bge-large-en-v1\.5:[0-9a-f]{32}$/);
  });

  it('throws if embeddingConfig is missing — silent fallback would re-introduce the silent-corruption bug', () => {
    const chunk = { text: 'x', path: 'a.md' };
    assert.throws(
      () => computeChunkContentHashTagged(chunk),
      /embeddingConfig is required/,
    );
    assert.throws(
      () => computeChunkContentHashTagged(chunk, null),
      /embeddingConfig is required/,
    );
  });

  it('throws on missing/empty provider or model (caller bug surfaces loudly)', () => {
    const chunk = { text: 'x', path: 'a.md' };
    assert.throws(
      () => computeChunkContentHashTagged(chunk, { model: 'm' }),
      /provider must be a non-empty string/,
    );
    assert.throws(
      () => computeChunkContentHashTagged(chunk, { provider: '', model: 'm' }),
      /provider must be a non-empty string/,
    );
    assert.throws(
      () => computeChunkContentHashTagged(chunk, { provider: 'deepinfra' }),
      /model must be a non-empty string/,
    );
    // A whitespace-only model must be rejected just like an absent one.
    assert.throws(
      () => computeChunkContentHashTagged(chunk, { provider: 'deepinfra', model: ' ' }),
      /model must be a non-empty string/,
    );
  });
});
File History
2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠
2 days ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6
docs: accept Calendar Events v0 spec with Phase 0 security …
Human
2 days ago