index-partition.mjs
64 lines 2.5 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago
/**
 * A chunk built from the current export, paired with the id it is stored under
 * and its versioned content hash.
 *
 * @typedef {{
 *   chunk: { id: string, text: string, path: string, [k: string]: any },
 *   storeId: string,
 *   contentHash: string,
 * }} ChunkWithHash
 */

/**
 * Pure partition step for the incremental cache flow of
 * `hub/bridge/server.mjs POST /api/v1/index`.
 *
 * Compares the chunks built from the canister export against the
 * `(chunk_id → content_hash)` Map the sqlite-vec / Qdrant store persisted after
 * the previous successful index, and sorts them into three outcomes:
 *
 * - **skipped**: the stored hash equals the current one, so vector + payload are
 *   already correct (only counted, not returned);
 * - **to embed**: the chunk is new, or its text/metadata changed;
 * - **orphans**: chunk_ids the store still holds but the current export no
 *   longer contains (e.g. note deleted or path renamed).
 *
 * Kept separate from the index handler so the partition contract can be unit
 * tested without the canister, embedding provider, or sqlite-vec backend.
 * Neither argument is mutated.
 *
 * @param {ChunkWithHash[]} chunksWithHash - Output of building chunks for the current export.
 * @param {Map<string, string>|null|undefined} existingHashes - From `store.getChunkHashes(vaultId)`.
 *   Any value that is not a `Map` instance (including null/undefined, e.g. a
 *   backend without the surface) is treated as an empty cache.
 * @returns {{
 *   toEmbed: ChunkWithHash[],
 *   skippedCachedCount: number,
 *   orphanIds: string[],
 *   presentChunkIds: Set<string>,
 * }} `toEmbed` keeps input order and holds the original item references;
 *   `orphanIds` follows the cache Map's key order.
 * @throws {TypeError} If `chunksWithHash` is not an array, or any item lacks a
 *   non-empty string `storeId` or `contentHash`.
 */
export function partitionChunksForReindex(chunksWithHash, existingHashes) {
  if (!Array.isArray(chunksWithHash)) {
    throw new TypeError('partitionChunksForReindex: chunksWithHash must be an array');
  }

  const isNonEmptyString = (value) => typeof value === 'string' && value !== '';
  const priorHashes = existingHashes instanceof Map ? existingHashes : new Map();
  const presentChunkIds = new Set();
  const toEmbed = [];
  let skippedCachedCount = 0;

  for (const entry of chunksWithHash) {
    // Optional chaining folds the null/undefined item case into the same
    // validation failure as a missing or empty field.
    const storeId = entry?.storeId;
    const contentHash = entry?.contentHash;
    if (!isNonEmptyString(storeId) || !isNonEmptyString(contentHash)) {
      throw new TypeError(
        'partitionChunksForReindex: each item must have non-empty storeId and contentHash',
      );
    }

    presentChunkIds.add(storeId);

    // contentHash is a validated non-empty string, so strict equality alone
    // also rules out a missing or empty stored hash.
    if (priorHashes.get(storeId) === contentHash) {
      skippedCachedCount += 1;
    } else {
      toEmbed.push(entry);
    }
  }

  // Whatever the store remembers that the export did not produce is stale.
  const orphanIds = [...priorHashes.keys()].filter((id) => !presentChunkIds.has(id));

  return { toEmbed, skippedCachedCount, orphanIds, presentChunkIds };
}
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 1 day ago