chunk-content-hash.mjs
150 lines 6.3 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago
1 /**
2 * Stable content hash for an indexed chunk. Used by `hub/bridge/server.mjs`
3 * `POST /api/v1/index` to skip re-embedding chunks whose text and search-relevant
4 * metadata did not change since the last successful index.
5 *
6 * The hash MUST be stable across processes (same canonical input → same digest)
7 * because we persist it in the sqlite-vec auxiliary column `+content_hash` and
8 * compare against newly-built chunks on every re-index. Bumping the algorithm or
9 * the canonical-form ordering breaks every cache hit and forces a full re-embed.
10 *
11 * Why include metadata (path/tags/project/date/...) — not just text:
12 * The bridge upserts the chunk row with these fields as vec0 metadata; if any
13 * of them change we must re-write the row even though the embedding is the
14 * same. Hashing them together means "skip" is always safe (vector AND payload
15 * are still correct).
16 *
17 * Truncation: SHA-256 → first 32 hex chars (128 bits). Birthday-collision
18 * probability for ~10^9 chunks per vault is ≈ 10^-21, far below the noise floor
19 * of any other failure mode (network blip, embedding API hiccup).
20 */
21
22 import crypto from 'crypto';
23
/**
 * Derive the truncated SHA-256 digest identifying a chunk's text together with
 * its search-relevant metadata.
 *
 * Digest input, in order: the raw `text`, one NUL byte, then a JSON document
 * with a fixed key order (`p`, `pr`, `t`, `d`, `cc`, `e`, `ep`). Only these
 * named fields are read, so extra properties on `chunk` never affect the result.
 *
 * @param {{
 *   text: string,
 *   path: string,
 *   project?: string|null,
 *   tags?: string[]|null,
 *   date?: string|null,
 *   causal_chain_id?: string|null,
 *   entity?: string[]|null,
 *   episode_id?: string|null,
 * }} chunk - `text` and `path` are mandatory strings. Missing or nullish scalar
 *   fields are hashed as `null`; `tags` / `entity` values that are not arrays
 *   are hashed as `[]`, so an omitted field and an explicit `null` agree.
 * @returns {string} The first 32 lowercase hex characters (128 bits) of the digest.
 * @throws {TypeError} If `chunk` is not an object, or `text` / `path` is not a string.
 */
export function computeChunkContentHash(chunk) {
  if (chunk == null || typeof chunk !== 'object') {
    throw new TypeError('computeChunkContentHash: chunk is required');
  }

  const { text, path } = chunk;
  if (typeof text !== 'string') {
    throw new TypeError('computeChunkContentHash: chunk.text must be a string');
  }
  if (typeof path !== 'string') {
    throw new TypeError('computeChunkContentHash: chunk.path must be a string');
  }

  // Sort a copy so element order never influences the digest and the caller's
  // array is left untouched; anything that is not an array collapses to [].
  const sortedCopy = (value) => (Array.isArray(value) ? [...value].sort() : []);
  const orNull = (value) => value ?? null;

  // Build a fresh object literal instead of serialising `chunk` itself: the key
  // order is then fixed by this code, not by whoever constructed the chunk.
  const canonicalMeta = JSON.stringify({
    p: path,
    pr: orNull(chunk.project),
    t: sortedCopy(chunk.tags),
    d: orNull(chunk.date),
    cc: orNull(chunk.causal_chain_id),
    e: sortedCopy(chunk.entity),
    ep: orNull(chunk.episode_id),
  });

  // The NUL byte separates text from metadata. JSON.stringify escapes control
  // characters, so the final NUL in the stream is always this separator.
  return crypto
    .createHash('sha256')
    .update(text)
    .update('\x00')
    .update(canonicalMeta)
    .digest('hex')
    .slice(0, 32);
}
71
/**
 * Version tag for the canonical hash form. It is emitted as the leading segment
 * of the string built by `computeChunkContentHashTagged`, so changing this value
 * makes every previously stored tagged hash compare unequal. That is the
 * intended way to force a one-time full re-embed after the canonical form or
 * the digest algorithm changes, with no separate "rebuild" flag required.
 */
export const CHUNK_CONTENT_HASH_VERSION = 'v1';
79
/**
 * Reduce a provider name to a prefix-safe token.
 *
 * The value is stringified, stripped of surrounding whitespace and lowercased;
 * every remaining character outside `[a-z0-9_-]` is then replaced one-for-one
 * with `_`. Differently-cased or oddly punctuated spellings of a provider
 * therefore still yield a stable, comparable token.
 *
 * @param {string} provider - Provider name, e.g. `'deepinfra'` or `'openai'`.
 * @returns {string} The normalized token.
 */
function normalizeProviderForHash(provider) {
  const lowered = String(provider).trim().toLowerCase();
  return lowered.replace(/[^a-z0-9_-]/g, '_');
}
95
/**
 * Normalize a model name for the hash prefix.
 *
 * Model names legitimately contain `/` and `.` (e.g. `BAAI/bge-large-en-v1.5`)
 * and are case-sensitive, so they are NOT lowercased or alphanumeric-stripped.
 * Surrounding whitespace is trimmed, then every run of whitespace and/or
 * control characters is collapsed into a single `_`.
 *
 * "Control characters" means the whole Unicode `Cc` category (C0 controls, DEL
 * and C1 controls), not just NUL: an ESC or BEL byte in the stored prefix would
 * corrupt log lines just as a newline would.
 *
 * @param {string} model - Model identifier as configured for the embedding provider.
 * @returns {string} The normalized model name.
 */
function normalizeModelForHash(model) {
  return String(model)
    .trim()
    // `\s` already covers \r and \n; `\p{Cc}` adds the non-whitespace controls.
    .replace(/[\s\p{Cc}]+/gu, '_');
}
109
/**
 * Build the string persisted in the `+content_hash` column.
 *
 * Shape: `"<version>:<provider>:<model>:<32-hex>"`, where the version is
 * `CHUNK_CONTENT_HASH_VERSION`, provider and model are the normalized values
 * from `embeddingConfig`, and the trailing segment is
 * `computeChunkContentHash(chunk)`.
 *
 * Provider and model are part of the value so that switching to a different
 * embedding model, even one with the same vector dimension, makes every stored
 * hash compare unequal and triggers a re-embed instead of silently keeping
 * vectors from another vector space. The readable prefix also lets an operator
 * see which provider/model produced a given row.
 *
 * @param {{ text: string, path: string }} chunk - Passed through to
 *   `computeChunkContentHash`.
 * @param {{ provider: string, model: string }} embeddingConfig - Provider and
 *   model about to be used for embedding. Both must be non-blank strings.
 * @returns {string} `"v1:<provider>:<model>:<32-hex>"`
 * @throws {TypeError} If `embeddingConfig` is not an object, or its `provider`
 *   or `model` is not a string with non-whitespace content.
 */
export function computeChunkContentHashTagged(chunk, embeddingConfig) {
  const hasConfig = embeddingConfig != null && typeof embeddingConfig === 'object';
  if (!hasConfig) {
    throw new TypeError(
      'computeChunkContentHashTagged: embeddingConfig is required ({ provider, model }). Without provider+model in the prefix, a same-dimension model swap would silently keep stale vectors.',
    );
  }

  // A field is usable only when it is a string with non-whitespace content.
  const isBlankOrNotString = (value) => !(typeof value === 'string' && value.trim() !== '');

  if (isBlankOrNotString(embeddingConfig.provider)) {
    throw new TypeError(
      'computeChunkContentHashTagged: embeddingConfig.provider must be a non-empty string',
    );
  }
  if (isBlankOrNotString(embeddingConfig.model)) {
    throw new TypeError(
      'computeChunkContentHashTagged: embeddingConfig.model must be a non-empty string',
    );
  }

  const providerTag = normalizeProviderForHash(embeddingConfig.provider);
  const modelTag = normalizeModelForHash(embeddingConfig.model);
  const digest = computeChunkContentHash(chunk);
  return [CHUNK_CONTENT_HASH_VERSION, providerTag, modelTag, digest].join(':');
}
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 1 day ago