chunk-content-hash.test.mjs
216 lines 8.4 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 2 days ago
1 /**
2 * Tests for `lib/chunk-content-hash.mjs`. The bridge will compare these hashes
3 * against the `+content_hash` column in sqlite-vec to decide whether to skip
4 * embedding a chunk on re-index, so the contract that matters here is:
5 * 1. Same logical input → same digest, deterministically, across processes.
6 * 2. Different text OR different search-relevant metadata → different digest.
7 * 3. Tag/entity array order is not significant (chunks built in different
8 * orders must hash identically).
9 * 4. Missing/null/undefined for optional fields hash identically (so a chunk
10 * with `tags: undefined` matches one with `tags: []`).
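 * 5. Tagged variant carries the version prefix `v1:` followed by the
 * normalized provider and model (`v1:<provider>:<model>:<hex>`), so future
 * algo bumps or embedding-model swaps can be detected on read without
 * touching every callsite.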
13 * 6. Bad input throws (so a bridge bug surfaces loudly, not silently).
14 */
15
16 import { describe, it } from 'node:test';
17 import assert from 'node:assert/strict';
18 import {
19 computeChunkContentHash,
20 computeChunkContentHashTagged,
21 CHUNK_CONTENT_HASH_VERSION,
22 } from '../lib/chunk-content-hash.mjs';
23
/**
 * Contract tests for the untagged digest returned by `computeChunkContentHash`.
 *
 * Covered here: determinism and output shape, sensitivity to text and to each
 * metadata field, order-independence of `tags`/`entity`, equivalence of
 * missing/null/undefined/empty optional fields, and loud failure on bad input.
 */
describe('computeChunkContentHash', () => {
  it('is deterministic and returns 32 lowercase hex chars (128 bits)', () => {
    const chunk = { text: 'hello world', path: 'notes/a.md' };
    const first = computeChunkContentHash(chunk);
    const second = computeChunkContentHash(chunk);
    // A structurally equal copy must hash the same too: the digest has to
    // depend on the chunk's content, not on object identity.
    const fromCopy = computeChunkContentHash({ ...chunk });

    assert.equal(first, second);
    assert.equal(first, fromCopy);
    assert.match(first, /^[0-9a-f]{32}$/);
  });

  it('changes when text changes', () => {
    const one = computeChunkContentHash({ text: 'one', path: 'a.md' });
    const two = computeChunkContentHash({ text: 'two', path: 'a.md' });
    assert.notEqual(one, two);
  });

  it('changes when path changes (since path is part of the search-relevant payload)', () => {
    const underA = computeChunkContentHash({ text: 'same', path: 'a.md' });
    const underB = computeChunkContentHash({ text: 'same', path: 'b.md' });
    assert.notEqual(underA, underB);
  });

  it('changes when project, date, or tags change (so metadata edits invalidate the cache)', () => {
    const base = { text: 'x', path: 'a.md' };
    const baseHash = computeChunkContentHash(base);

    // Despite the short title, every optional metadata field is exercised.
    // Each one is set in isolation on top of `base` so a failure names the
    // exact field that stopped contributing to the digest.
    const variations = {
      project: 'p1',
      date: '2026-05-01',
      tags: ['x'],
      causal_chain_id: 'c1',
      episode_id: 'ep1',
      entity: ['e1'],
    };

    for (const [field, value] of Object.entries(variations)) {
      assert.notEqual(
        computeChunkContentHash({ ...base, [field]: value }),
        baseHash,
        `setting ${field} must change the digest`,
      );
    }
  });

  it('treats tag/entity arrays as sets (order-independent)', () => {
    const shared = { text: 'x', path: 'a.md' };
    const descending = computeChunkContentHash({
      ...shared,
      tags: ['b', 'a'],
      entity: ['z', 'y'],
    });
    const ascending = computeChunkContentHash({
      ...shared,
      tags: ['a', 'b'],
      entity: ['y', 'z'],
    });
    assert.equal(descending, ascending);
  });

  it('treats undefined/null/missing equivalently for optional fields', () => {
    // Three spellings of "no optional metadata": absent keys, explicit nulls,
    // and explicit undefined / empty arrays. All must yield the same digest.
    const minimal = { text: 'x', path: 'a.md' };
    const explicitNulls = {
      text: 'x',
      path: 'a.md',
      project: null,
      tags: null,
      date: null,
      causal_chain_id: null,
      entity: null,
      episode_id: null,
    };
    const explicitEmpty = {
      text: 'x',
      path: 'a.md',
      project: undefined,
      tags: [],
      date: undefined,
      causal_chain_id: undefined,
      entity: [],
      episode_id: undefined,
    };

    const expected = computeChunkContentHash(minimal);
    assert.equal(expected, computeChunkContentHash(explicitNulls));
    assert.equal(expected, computeChunkContentHash(explicitEmpty));
  });

  it('throws on missing chunk', () => {
    for (const missing of [null, undefined]) {
      assert.throws(() => computeChunkContentHash(missing), /chunk is required/);
    }
  });

  it('throws on missing/wrong-type text or path (bridge bug must surface loudly)', () => {
    // Missing required fields.
    assert.throws(
      () => computeChunkContentHash({ path: 'a.md' }),
      /chunk\.text must be a string/,
    );
    assert.throws(
      () => computeChunkContentHash({ text: 'x' }),
      /chunk\.path must be a string/,
    );

    // Present but wrong-typed fields. The `path` case mirrors the `text` case
    // so both halves of the title are covered.
    assert.throws(
      () => computeChunkContentHash({ text: 123, path: 'a.md' }),
      /chunk\.text must be a string/,
    );
    assert.throws(
      () => computeChunkContentHash({ text: 'x', path: 123 }),
      /chunk\.path must be a string/,
    );
  });
});
127
/**
 * Contract tests for `computeChunkContentHashTagged`, which takes a chunk plus
 * an embedding config (`{ provider, model }`).
 *
 * The assertions pin the output format `v<N>:<provider>:<model>:<32-hex>`,
 * check that changing provider or model changes the result, and check that a
 * missing or invalid embedding config throws.
 */
describe('computeChunkContentHashTagged', () => {
  // Default embedding config shared by tests that do not vary provider/model.
  const cfg = { provider: 'deepinfra', model: 'BAAI/bge-large-en-v1.5' };

  it('format is "v<N>:<provider>:<model>:<32-hex>" with provider lowercased', () => {
    const tagged = computeChunkContentHashTagged({ text: 'hello', path: 'a.md' }, cfg);
    const parts = tagged.split(':');

    // Pin the segment count as well as the segments themselves; otherwise a
    // result with extra trailing segments would slip through unnoticed.
    assert.equal(parts.length, 4, `expected exactly 4 ":"-separated segments, got: ${tagged}`);

    const [version, provider, model, digest] = parts;
    assert.equal(version, 'v1');
    assert.equal(provider, 'deepinfra');
    assert.equal(model, 'BAAI/bge-large-en-v1.5');
    assert.match(digest, /^[0-9a-f]{32}$/);
  });

  it('current version is v1', () => {
    assert.equal(CHUNK_CONTENT_HASH_VERSION, 'v1');
  });

  it('two equivalent chunks under same provider+model produce equal tagged hashes', () => {
    // Same chunk content, tags listed in a different order.
    const tagsBA = computeChunkContentHashTagged({ text: 'x', path: 'a.md', tags: ['b', 'a'] }, cfg);
    const tagsAB = computeChunkContentHashTagged({ text: 'x', path: 'a.md', tags: ['a', 'b'] }, cfg);
    assert.equal(tagsBA, tagsAB);
  });

  it('changing provider invalidates the cache (different prefix)', () => {
    const chunk = { text: 'same', path: 'a.md' };
    const viaOpenAi = computeChunkContentHashTagged(chunk, {
      provider: 'openai',
      model: 'text-embedding-3-small',
    });
    const viaDeepInfra = computeChunkContentHashTagged(chunk, {
      provider: 'deepinfra',
      model: 'BAAI/bge-large-en-v1.5',
    });
    assert.notEqual(
      viaOpenAi,
      viaDeepInfra,
      'same chunk under different providers must hash differently',
    );
  });

  it('changing model (same provider, same dimension) invalidates the cache', () => {
    // The whole point of putting model in the prefix: BGE-large (1024) → BGE-m3 (1024)
    // is a same-dimension swap that the dimension check cannot catch. Without model in the
    // hash, every chunk would be a cache hit and we would silently keep stale vectors.
    const chunk = { text: 'same', path: 'a.md' };
    const withBgeLarge = computeChunkContentHashTagged(chunk, {
      provider: 'deepinfra',
      model: 'BAAI/bge-large-en-v1.5',
    });
    const withBgeM3 = computeChunkContentHashTagged(chunk, {
      provider: 'deepinfra',
      model: 'BAAI/bge-m3',
    });
    assert.notEqual(
      withBgeLarge,
      withBgeM3,
      'same chunk under different models must hash differently',
    );
  });

  it('provider is lowercased + alphanumeric-stripped (deterministic across casing/typos)', () => {
    // NOTE(review): only casing and a trailing space are exercised here; the
    // title's broader "stripped" claim is not otherwise pinned by this test.
    const chunk = { text: 'x', path: 'a.md' };
    const canonical = computeChunkContentHashTagged(chunk, { provider: 'deepinfra', model: 'm' });

    for (const provider of ['DeepInfra', 'deepinfra ']) {
      assert.equal(computeChunkContentHashTagged(chunk, { provider, model: 'm' }), canonical);
    }
  });

  it('model preserves slashes (e.g. BAAI/bge-large) but collapses whitespace', () => {
    // NOTE(review): the input only has leading/trailing whitespace, so this
    // pins trimming; collapsing of interior whitespace is not exercised.
    const chunk = { text: 'x', path: 'a.md' };
    const tagged = computeChunkContentHashTagged(chunk, {
      provider: 'deepinfra',
      model: ' BAAI/bge-large-en-v1.5 ',
    });
    assert.match(tagged, /^v1:deepinfra:BAAI\/bge-large-en-v1\.5:[0-9a-f]{32}$/);
  });

  it('throws if embeddingConfig is missing — silent fallback would re-introduce the silent-corruption bug', () => {
    const chunk = { text: 'x', path: 'a.md' };
    // Omitted argument and explicit null must both be rejected.
    assert.throws(
      () => computeChunkContentHashTagged(chunk),
      /embeddingConfig is required/,
    );
    assert.throws(
      () => computeChunkContentHashTagged(chunk, null),
      /embeddingConfig is required/,
    );
  });

  it('throws on missing/empty provider or model (caller bug surfaces loudly)', () => {
    const chunk = { text: 'x', path: 'a.md' };

    // Provider: absent, then empty string.
    assert.throws(
      () => computeChunkContentHashTagged(chunk, { model: 'm' }),
      /provider must be a non-empty string/,
    );
    assert.throws(
      () => computeChunkContentHashTagged(chunk, { provider: '', model: 'm' }),
      /provider must be a non-empty string/,
    );

    // Model: absent, then whitespace-only.
    assert.throws(
      () => computeChunkContentHashTagged(chunk, { provider: 'deepinfra' }),
      /model must be a non-empty string/,
    );
    assert.throws(
      () => computeChunkContentHashTagged(chunk, { provider: 'deepinfra', model: ' ' }),
      /model must be a non-empty string/,
    );
  });
});
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 2 days ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 2 days ago