chunk-content-hash.mjs
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠ breaking
1 day ago
| 1 | /** |
| 2 | * Stable content hash for an indexed chunk. Used by `hub/bridge/server.mjs` |
| 3 | * `POST /api/v1/index` to skip re-embedding chunks whose text and search-relevant |
| 4 | * metadata did not change since the last successful index. |
| 5 | * |
| 6 | * The hash MUST be stable across processes (same canonical input → same digest) |
| 7 | * because we persist it in the sqlite-vec auxiliary column `+content_hash` and |
| 8 | * compare against newly-built chunks on every re-index. Bumping the algorithm or |
| 9 | * the canonical-form ordering breaks every cache hit and forces a full re-embed. |
| 10 | * |
| 11 | * Why include metadata (path/tags/project/date/...) — not just text: |
| 12 | * The bridge upserts the chunk row with these fields as vec0 metadata; if any |
| 13 | * of them change we must re-write the row even though the embedding is the |
| 14 | * same. Hashing them together means "skip" is always safe (vector AND payload |
| 15 | * are still correct). |
| 16 | * |
| 17 | * Truncation: SHA-256 → first 32 hex chars (128 bits). Birthday-collision |
| 18 | * probability for ~10^9 chunks per vault is ≈ 10^-21, far below the noise floor |
| 19 | * of any other failure mode (network blip, embedding API hiccup). |
| 20 | */ |
| 21 | |
| 22 | import crypto from 'crypto'; |
| 23 | |
/**
 * Derive the canonical, process-independent content hash of a chunk.
 *
 * Only an explicit whitelist of fields feeds the digest, so unrelated chunk
 * properties added in the future do not silently invalidate cached hashes.
 * Missing scalar metadata is canonicalised to `null`, and missing or non-array
 * `tags` / `entity` to `[]`; both arrays are sorted on a copy, so element order
 * never affects the result and the caller's arrays are left untouched.
 *
 * @param {{
 *   text: string,
 *   path: string,
 *   project?: string|null,
 *   tags?: string[]|null,
 *   date?: string|null,
 *   causal_chain_id?: string|null,
 *   entity?: string[]|null,
 *   episode_id?: string|null,
 * }} chunk - As produced by `lib/chunk.mjs:chunkNote`. Only `text` and `path`
 *   are mandatory; chunks built with or without the optional fields hash
 *   identically as long as the effective values match.
 * @returns {string} 32 lowercase hex characters (the first 128 bits of SHA-256).
 * @throws {TypeError} If `chunk` is not an object, or `text` / `path` is not a string.
 */
export function computeChunkContentHash(chunk) {
  if (chunk == null || typeof chunk !== 'object') {
    throw new TypeError('computeChunkContentHash: chunk is required');
  }

  const { text, path } = chunk;
  if (typeof text !== 'string') {
    throw new TypeError('computeChunkContentHash: chunk.text must be a string');
  }
  if (typeof path !== 'string') {
    throw new TypeError('computeChunkContentHash: chunk.path must be a string');
  }

  // Sort a copy so the caller's array keeps its order; anything that is not an
  // array (undefined, null, ...) canonicalises to an empty list.
  const sortedCopy = (value) => (Array.isArray(value) ? value.slice().sort() : []);

  // Key order here is part of the canonical form: JSON.stringify emits keys in
  // insertion order, so reordering them would change every digest.
  const canonicalMeta = {
    p: path,
    pr: chunk.project ?? null,
    t: sortedCopy(chunk.tags),
    d: chunk.date ?? null,
    cc: chunk.causal_chain_id ?? null,
    e: sortedCopy(chunk.entity),
    ep: chunk.episode_id ?? null,
  };

  // text, a NUL separator, then the metadata JSON — fed as three updates.
  return crypto
    .createHash('sha256')
    .update(text)
    .update('\x00')
    .update(JSON.stringify(canonicalMeta))
    .digest('hex')
    .slice(0, 32);
}
| 71 | |
/**
 * Version tag of the canonical hash form; it is the leading segment of every
 * tagged hash built by `computeChunkContentHashTagged`.
 *
 * Bump it whenever the canonical form changes (a metadata field is added, the
 * key order moves, the digest algorithm is swapped). Previously stored values
 * then stop matching, which forces a one-time full re-embed without needing a
 * separate "rebuild" flag anywhere else.
 */
export const CHUNK_CONTENT_HASH_VERSION = 'v1';
| 79 | |
/**
 * Canonicalise an embedding-provider name for use inside the hash prefix.
 *
 * The value is stringified, trimmed and lowercased, then every character
 * outside `[a-z0-9_-]` is replaced (one for one) with `_`. Known providers such
 * as `'deepinfra'`, `'openai'`, `'voyage'` or `'ollama'` pass through unchanged,
 * while a misspelled or extended name still yields a stable, comparable token.
 *
 * @param {string} provider - Provider name, e.g. from `getBridgeEmbeddingConfig().provider`.
 * @returns {string} Lowercase token containing only `[a-z0-9_-]`.
 */
function normalizeProviderForHash(provider) {
  const disallowedChars = /[^a-z0-9_-]/g;
  const lowered = String(provider).trim().toLowerCase();
  return lowered.replace(disallowedChars, '_');
}
| 95 | |
/**
 * Canonicalise an embedding-model name for use inside the hash prefix.
 *
 * Model identifiers legitimately contain `/`, `.` and `:` (for example
 * `BAAI/bge-large-en-v1.5`), so printable characters are kept as-is and case is
 * preserved. The value is stringified and trimmed, then each run of whitespace
 * and/or control characters is collapsed into a single `_`.
 *
 * "Control characters" covers the whole C0 range (U+0000–U+001F), DEL (U+007F)
 * and the C1 range (U+0080–U+009F), not just NUL: characters such as ESC or BEL
 * would otherwise end up verbatim in the persisted prefix and corrupt log
 * lines. Names without such characters normalise exactly as before.
 *
 * @param {string} model - Model identifier as configured for the embedding call.
 * @returns {string} The trimmed name with whitespace/control runs replaced by `_`.
 */
function normalizeModelForHash(model) {
  // `\s` already covers tab/CR/LF and Unicode spaces; the explicit ranges add
  // the remaining C0 controls, DEL and the C1 controls that `\s` does not match.
  const whitespaceOrControlRun = /[\s\x00-\x1f\x7f-\x9f]+/g;
  return String(model).trim().replace(whitespaceOrControlRun, '_');
}
| 109 | |
/**
 * Build the string stored in the `+content_hash` column for a chunk.
 *
 * Layout: `"v1:<provider>:<model>:<32-hex>"` — the hash version, the normalised
 * provider, the normalised model, then the chunk's content hash. Embedding the
 * provider and model means that switching to a different model (even one with
 * the same vector dimension, which lives in a different vector space) changes
 * the stored value, so cached rows no longer match and are re-embedded rather
 * than silently serving stale vectors. The prefix also keeps stored values
 * human-readable: the producing provider/model is visible at a glance.
 *
 * @param {{ text: string, path: string }} chunk - Chunk to hash; validated by
 *   `computeChunkContentHash`.
 * @param {{ provider: string, model: string }} embeddingConfig - Provider and
 *   model about to be used for embedding. Both are mandatory: omitting either is
 *   treated as a caller bug and rejected loudly.
 * @returns {string} `"v1:<provider>:<model>:<32-hex>"`
 * @throws {TypeError} If `embeddingConfig` is not an object, or `provider` /
 *   `model` is not a non-blank string.
 */
export function computeChunkContentHashTagged(chunk, embeddingConfig) {
  if (embeddingConfig == null || typeof embeddingConfig !== 'object') {
    throw new TypeError(
      'computeChunkContentHashTagged: embeddingConfig is required ({ provider, model }). Without provider+model in the prefix, a same-dimension model swap would silently keep stale vectors.',
    );
  }

  // True for anything that is not a string with at least one non-space character.
  const isBlank = (value) => typeof value !== 'string' || value.trim() === '';

  const { provider, model } = embeddingConfig;
  if (isBlank(provider)) {
    throw new TypeError(
      'computeChunkContentHashTagged: embeddingConfig.provider must be a non-empty string',
    );
  }
  if (isBlank(model)) {
    throw new TypeError(
      'computeChunkContentHashTagged: embeddingConfig.model must be a non-empty string',
    );
  }

  const providerTag = normalizeProviderForHash(provider);
  const modelTag = normalizeModelForHash(model);
  const digest = computeChunkContentHash(chunk);
  return `${CHUNK_CONTENT_HASH_VERSION}:${providerTag}:${modelTag}:${digest}`;
}
File History
2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd
feat(calendar): enforce agent context tiers in retrieval AP…
Human
minor
⚠
1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6
docs: accept Calendar Events v0 spec with Phase 0 security …
Human
1 day ago