vector-store.mjs
327 lines 12.0 KB
Raw
sha256:8d46372e39d2d5a54fd93a8b1c27922fe0d9b22a72197345f1d2c71701cc4ce2 feat(auth): persistent login system + C7 session introspection Human minor ⚠ breaking 16 days ago
1 /**
2 * Vector store abstraction. Qdrant backend for indexer (Phase 2); search in Phase 3.
3 * SPEC §5: metadata (path, project, tags, date); stable chunk id for upsert.
4 */
5
6 import crypto from 'crypto';
7 import { QdrantClient } from '@qdrant/js-client-rest';
8 import { MAX_VECTOR_KNN } from './vector-knn-limit.mjs';
9
10 const COLLECTION_NAME = 'knowtation';
11
/**
 * Map a chunk id string to the numeric Qdrant point id used for upserts.
 *
 * The id is the first four bytes of the SHA-256 digest of `chunkId`, read as a
 * big-endian unsigned integer, so the same chunk id always produces the same
 * point id (upserting it again overwrites rather than duplicates).
 * Note: the result has only 32 bits, so two distinct chunk ids can map to the
 * same point id.
 *
 * @param {string} chunkId
 * @returns {number} integer in the range 0 .. 2^32 - 1
 */
function pointIdFromChunkId(chunkId) {
  const digestHex = crypto.createHash('sha256').update(chunkId).digest('hex');
  // Eight hex characters are the leading four digest bytes, most significant first.
  return Number.parseInt(digestHex.slice(0, 8), 16);
}
19
/**
 * Reduce a date string to its first ten characters (the YYYY-MM-DD part of an
 * ISO 8601 value) so that dates can be compared as plain strings.
 * The input is only trimmed and truncated; it is not validated as a date.
 * @param {string} d - ISO 8601 or YYYY-MM-DD
 * @returns {string} the truncated value, or '' when `d` is not a string
 */
function dateToComparable(d) {
  return typeof d === 'string' ? d.trim().slice(0, 10) : '';
}
30
/**
 * Build the Qdrant payload filter for vault_id, project, tag, chain, entity and episode.
 * Each non-empty field becomes one `must` clause; null, undefined and '' are skipped.
 * `since`/`until` are accepted in the argument but produce no clause here: the date
 * range, like the folder (path prefix), is applied by the caller as a post-filter.
 * @param {{ project?: string, tag?: string, since?: string, until?: string, chain?: string, entity?: string, episode?: string, vault_id?: string }} filters
 * @returns {{ must?: object[] }} `{ must }` when at least one clause applies, otherwise `{}`
 */
function buildFilter(filters) {
  // [field on `filters`, payload key, match operator], in the order clauses are emitted.
  // `any` is used for the payload keys that are stored as arrays (tags, entity).
  const rules = [
    ['vault_id', 'vault_id', 'value'],
    ['project', 'project', 'value'],
    ['tag', 'tags', 'any'],
    ['chain', 'causal_chain_id', 'value'],
    ['entity', 'entity', 'any'],
    ['episode', 'episode_id', 'value'],
  ];
  const must = [];
  for (const [field, key, operator] of rules) {
    const wanted = filters[field];
    if (wanted == null || wanted === '') continue;
    const match = operator === 'any' ? { any: [wanted] } : { value: wanted };
    must.push({ key, match });
  }
  return must.length > 0 ? { must } : {};
}
60
/**
 * Report whether an error thrown by the Qdrant client means the collection is missing.
 * Recognises the message fragments this store has always checked ("Not found" / "404");
 * a numeric `status` of 404 is also accepted in case the client exposes one.
 * Safe to call with any thrown value, including null or a non-Error.
 * @param {unknown} err
 * @returns {boolean}
 */
function isCollectionNotFound(err) {
  if (err?.status === 404) return true;
  const message = typeof err?.message === 'string' ? err.message : '';
  return message.includes('Not found') || message.includes('404');
}

/**
 * Turn the caller-supplied search limit into an integer between 0 and 100.
 * Missing or non-numeric values fall back to 10; fractions are truncated;
 * negative values become 0 (so they can never reach `Array.prototype.slice`
 * as a negative end index).
 * @param {unknown} rawLimit
 * @returns {number}
 */
function normalizeSearchLimit(rawLimit) {
  const requested = Number(rawLimit ?? 10);
  if (Number.isNaN(requested)) return 10;
  return Math.max(Math.trunc(Math.min(requested, 100)), 0);
}

/**
 * Create vector store (Qdrant or sqlite-vec). Validates config.
 *
 * `vector_store: 'sqlite-vec'` delegates to `createSqliteVectorStore` from
 * `./vector-store-sqlite.mjs`; anything other than 'qdrant' (the default) is rejected.
 *
 * @param {{ qdrant_url?: string, vector_store?: string, data_dir?: string }} config
 * @returns {Promise<{
 *   ensureCollection: (dimension: number) => Promise<void>,
 *   upsert: (points: object[]) => Promise<void>,
 *   search: (queryVector: number[], options?: object) => Promise<{ path: string, score: number, project: string|null, tags: string[], date: string|null, text: string|null }[]>,
 *   count: () => Promise<number>,
 *   deleteByVaultId: (vaultId: string) => Promise<number>,
 *   getChunkHashes: (vaultId: string) => Promise<Map<string, string>>,
 *   deleteByChunkIds: (chunkIds: string[]) => Promise<number>
 * }>}
 * @throws {Error} when `vector_store` is unknown or `qdrant_url` is missing / not a string
 */
export async function createVectorStore(config) {
  const store = config.vector_store || 'qdrant';
  if (store === 'sqlite-vec') {
    const { createSqliteVectorStore } = await import('./vector-store-sqlite.mjs');
    return createSqliteVectorStore(config);
  }
  if (store !== 'qdrant') {
    throw new Error(`Vector store "${store}" is not implemented. Use vector_store: qdrant or sqlite-vec.`);
  }
  const url = config.qdrant_url;
  if (!url || typeof url !== 'string') {
    throw new Error('Qdrant requires qdrant_url in config or QDRANT_URL env.');
  }

  // Strip every trailing slash so "http://host:6333//" and "http://host:6333" are equivalent.
  const client = new QdrantClient({ url: url.replace(/\/+$/, '') });

  return {
    /**
     * Create the collection (cosine distance) unless it already exists.
     * @param {number} dimension - vector size for a newly created collection
     * @returns {Promise<void>}
     * @throws any `getCollection` error that is not "collection not found"
     */
    async ensureCollection(dimension) {
      try {
        await client.getCollection(COLLECTION_NAME);
        return;
      } catch (err) {
        // Only a missing collection falls through to creation. Any other failure
        // is surfaced as-is instead of being masked by a follow-up create attempt.
        if (!isCollectionNotFound(err)) throw err;
      }
      await client.createCollection(COLLECTION_NAME, {
        vectors: {
          size: dimension,
          distance: 'Cosine',
        },
      });
    },

    /**
     * Upsert chunk points. The Qdrant point id is derived from each chunk id, so
     * re-indexing the same chunk overwrites its point.
     * @param {object[]} points - items with `id` (chunk id), `vector`, `path` and optional metadata
     * @returns {Promise<void>}
     */
    async upsert(points) {
      if (!Array.isArray(points) || points.length === 0) return;
      const payload = points.map((p) => ({
        id: pointIdFromChunkId(p.id),
        vector: p.vector,
        payload: {
          chunk_id: p.id,
          path: p.path,
          vault_id: p.vault_id ?? 'default',
          project: p.project ?? null,
          tags: p.tags ?? [],
          date: p.date ?? null,
          text: p.text ?? null,
          causal_chain_id: p.causal_chain_id ?? null,
          entity: p.entity ?? [],
          episode_id: p.episode_id ?? null,
          // `content_hash` is the bridge-side cache key for skip-re-embed; mirror the
          // sqlite-vec backend so both stores expose the same getChunkHashes surface.
          content_hash: p.content_hash ?? null,
        },
      }));
      await client.upsert(COLLECTION_NAME, {
        wait: true,
        points: payload,
      });
    },

    /**
     * Search by vector. Returns scored points with payload (path, project, tags, date, text).
     * Folder (path prefix) and date range are applied as post-filters on an over-fetched
     * candidate set, so fewer than `limit` hits may be returned; `order` is a post-sort.
     * @param {number[]} queryVector
     * @param {{ limit?: number, vault_id?: string, project?: string, tag?: string, folder?: string, since?: string, until?: string, chain?: string, entity?: string, episode?: string, order?: string }} options
     *   `limit` defaults to 10 and is clamped to 0..100 (0 or less yields `[]`);
     *   `order` may be 'date' (newest first) or 'date-asc'.
     * @returns {Promise<{ path: string, score: number, project: string|null, tags: string[], date: string|null, text: string|null }[]>}
     * @throws {Error} with an "index first" hint when the collection does not exist
     */
    async search(queryVector, options = {}) {
      const opts = options ?? {};
      const limit = normalizeSearchLimit(opts.limit);
      if (limit === 0) return [];

      // Normalise the folder once so the over-fetch decision and the post-filter agree
      // (a whitespace-only folder means "no folder filter").
      const folder = typeof opts.folder === 'string' ? opts.folder.trim() : '';
      const hasFolder = folder !== '';

      const filter = buildFilter({
        vault_id: opts.vault_id,
        project: opts.project,
        tag: opts.tag,
        since: opts.since,
        until: opts.until,
        chain: opts.chain,
        entity: opts.entity,
        episode: opts.episode,
      });

      // Post-filters discard candidates, so ask Qdrant for more than `limit` when one is active.
      let knnLimit = limit;
      if (hasFolder) {
        knnLimit = Math.min(Math.max(limit * 100, 2000), 10000);
      } else if (opts.since || opts.until) {
        knnLimit = Math.min(limit * 3, 300);
      }
      knnLimit = Math.min(knnLimit, MAX_VECTOR_KNN);

      let scored;
      try {
        scored = await client.search(COLLECTION_NAME, {
          vector: queryVector,
          limit: knnLimit,
          filter: Object.keys(filter).length ? filter : undefined,
          with_payload: true,
          with_vector: false,
        });
      } catch (e) {
        if (isCollectionNotFound(e)) {
          throw new Error('Vector store collection not found. Run "knowtation index" first to index your vault.');
        }
        throw e;
      }

      // Accept either a bare array of points or an object wrapping them in `points`.
      let scoredList = [];
      if (Array.isArray(scored)) {
        scoredList = scored;
      } else if (scored && typeof scored === 'object' && Array.isArray(scored.points)) {
        scoredList = scored.points;
      }

      let hits = scoredList.map((p) => {
        const pl = p.payload || {};
        const rawSc = p.score;
        let score = typeof rawSc === 'number' && Number.isFinite(rawSc) ? rawSc : Number(rawSc);
        if (!Number.isFinite(score)) score = 0;
        return {
          path: pl.path ?? '',
          score,
          project: pl.project ?? null,
          tags: Array.isArray(pl.tags) ? pl.tags : [],
          date: pl.date ?? null,
          text: pl.text ?? null,
        };
      });

      if (hasFolder) {
        const exact = folder.replace(/\\/g, '/').replace(/\/$/, '');
        const prefix = `${exact}/`;
        hits = hits.filter((h) => h.path === exact || h.path.startsWith(prefix));
      }

      const since = dateToComparable(opts.since);
      const until = dateToComparable(opts.until);
      if (since || until) {
        hits = hits.filter((h) => {
          const d = dateToComparable(h.date);
          if (!d) return false; // undated hits never match a date range
          if (since && d < since) return false;
          if (until && d > until) return false;
          return true;
        });
      }

      const order = opts.order;
      if (order === 'date-asc') {
        hits.sort((a, b) => (a.date || '').localeCompare(b.date || ''));
      } else if (order === 'date') {
        hits.sort((a, b) => (b.date || '').localeCompare(a.date || ''));
      }
      return hits.slice(0, limit);
    },

    /**
     * Return total point count for the collection (for count-only or existence check).
     * @returns {Promise<number>} 0 when the collection does not exist
     */
    async count() {
      try {
        const result = await client.count(COLLECTION_NAME);
        return result?.count ?? 0;
      } catch (e) {
        if (isCollectionNotFound(e)) return 0;
        throw e;
      }
    },

    /**
     * Remove all points tagged with this Hub vault_id (Phase 15 multi-vault).
     * @param {string} vaultId
     * @returns {Promise<number>} 1 if delete ran; 0 if the id is empty or the collection is missing
     */
    async deleteByVaultId(vaultId) {
      if (vaultId == null || vaultId === '') return 0;
      const vid = String(vaultId).trim();
      if (!vid) return 0;
      const filter = buildFilter({ vault_id: vid });
      // Never issue a delete without a vault clause: an empty filter would match everything.
      if (!filter.must || filter.must.length === 0) return 0;
      try {
        await client.delete(COLLECTION_NAME, { wait: true, filter });
        return 1;
      } catch (e) {
        if (isCollectionNotFound(e)) return 0;
        throw e;
      }
    },

    /**
     * Mirror of `vector-store-sqlite.getChunkHashes`: returns Map<chunk_id, content_hash>
     * for the named vault. Implemented via Qdrant `scroll` (paginated) since `count` does
     * not return payload. A null/empty content_hash is treated as cache miss and skipped.
     * At most 4096 pages of 256 points are read; anything beyond that is not returned.
     *
     * @param {string} vaultId
     * @returns {Promise<Map<string, string>>} empty when the id is empty or the collection is missing
     */
    async getChunkHashes(vaultId) {
      const out = new Map();
      if (vaultId == null || vaultId === '') return out;
      const vid = String(vaultId).trim();
      if (!vid) return out;
      const filter = buildFilter({ vault_id: vid });
      if (!filter.must || filter.must.length === 0) return out;
      const PAGE = 256;
      const MAX_PAGES = 4096;
      try {
        let nextOffset;
        // Bound iterations defensively (vault can have huge note count, but a runaway
        // server would block the index handler past Netlify's sync cap).
        for (let i = 0; i < MAX_PAGES; i++) {
          const page = await client.scroll(COLLECTION_NAME, {
            filter,
            with_payload: true,
            with_vector: false,
            limit: PAGE,
            offset: nextOffset,
          });
          const points = Array.isArray(page?.points) ? page.points : [];
          for (const p of points) {
            const cid = p?.payload?.chunk_id;
            const ch = p?.payload?.content_hash;
            if (typeof cid !== 'string' || cid === '') continue;
            if (typeof ch !== 'string' || ch === '') continue;
            out.set(cid, ch);
          }
          nextOffset = page?.next_page_offset;
          // Compare against null/undefined explicitly: point ids are integers, so an
          // offset of 0 is a legitimate "continue from here" value, not end-of-scroll.
          if (nextOffset == null || points.length === 0) break;
        }
      } catch (e) {
        if (isCollectionNotFound(e)) return out;
        throw e;
      }
      return out;
    },

    /**
     * Mirror of `vector-store-sqlite.deleteByChunkIds`: takes string chunk_ids and
     * deletes the corresponding integer points. Used by the bridge to remove orphans
     * (chunks present in the previous index but absent from the current export).
     *
     * @param {string[]} chunkIds
     * @returns {Promise<number>} count requested for delete (Qdrant does not return per-id success)
     */
    async deleteByChunkIds(chunkIds) {
      if (!Array.isArray(chunkIds) || chunkIds.length === 0) return 0;
      const ids = chunkIds
        .filter((cid) => typeof cid === 'string' && cid !== '')
        .map((cid) => pointIdFromChunkId(cid));
      if (ids.length === 0) return 0;
      try {
        await client.delete(COLLECTION_NAME, { wait: true, points: ids });
        return ids.length;
      } catch (e) {
        if (isCollectionNotFound(e)) return 0;
        throw e;
      }
    },
  };
}
326
327 export { COLLECTION_NAME };
File History 2 commits
sha256:8d46372e39d2d5a54fd93a8b1c27922fe0d9b22a72197345f1d2c71701cc4ce2 feat(auth): persistent login system + C7 session introspection Human minor 16 days ago