vector-store-sqlite.mjs
404 lines 16.5 KB
Raw
sha256:8d46372e39d2d5a54fd93a8b1c27922fe0d9b22a72197345f1d2c71701cc4ce2 feat(auth): persistent login system + C7 session introspection Human minor ⚠ breaking 16 days ago
1 /**
2 * sqlite-vec backend for vector store. Phase 10.
3 * Same interface as Qdrant: ensureCollection(dimension), upsert(points), search(queryVector, options), count().
4 * DB path: data_dir/knowtation_vectors.db
5 */
6
7 import path from 'path';
8 import fs from 'fs';
9 import crypto from 'crypto';
10 import Database from 'better-sqlite3';
11 import * as sqliteVec from 'sqlite-vec';
12 import { MAX_VECTOR_KNN } from './vector-knn-limit.mjs';
13
14 const TABLE_NAME = 'knowtation_vec';
15 const DB_FILENAME = 'knowtation_vectors.db';
16
17 /**
18 * sqlite-vec / better-sqlite3 may return `distance` as number, bigint, string, or (rarely) Buffer.
19 * @param {unknown} raw
20 * @returns {number|null}
21 */
22 function coerceVecDistance(raw) {
23 if (raw == null) return null;
24 if (typeof raw === 'number' && Number.isFinite(raw) && raw >= 0) return raw;
25 if (typeof raw === 'bigint') {
26 const n = Number(raw);
27 return Number.isFinite(n) && n >= 0 ? n : null;
28 }
29 if (typeof raw === 'string') {
30 const n = Number(raw);
31 return Number.isFinite(n) && n >= 0 ? n : null;
32 }
33 if (typeof Buffer !== 'undefined' && Buffer.isBuffer(raw) && raw.length === 8) {
34 const n = raw.readDoubleLE(0);
35 return Number.isFinite(n) && n >= 0 ? n : null;
36 }
37 return null;
38 }
39
40 function pointIdFromChunkId(chunkId) {
41 const buf = crypto.createHash('sha256').update(chunkId).digest();
42 return buf.readUInt32BE(0);
43 }
44
45 function dateToComparable(d) {
46 if (d == null || typeof d !== 'string') return '';
47 return d.trim().slice(0, 10) || '';
48 }
49
50 /**
51 * Create sqlite-vec vector store. Uses data_dir for DB path.
52 * @param {{
53 * vector_store: string,
54 * data_dir: string,
55 * allow_dimension_migration?: boolean,
56 * }} config - `allow_dimension_migration: true` lets `ensureCollection` drop + recreate
57 * the table when the persisted dimension differs from the requested dimension. Bridge
58 * sets this because it owns the data lifecycle (downloads from blob, indexes, uploads
59 * to blob); a provider switch (e.g. OpenAI 1536 → DeepInfra 1024) can only resolve via
60 * a full re-embed. CLI leaves it OFF so an accidental EMBEDDING_PROVIDER swap surfaces
61 * loudly instead of silently corrupting search.
62 * @returns {{ ensureCollection: (dimension: number) => Promise<void>, upsert: (points) => Promise<void>, search: (queryVector: number[], options?: { limit?: number, vault_id?: string, project?: string, tag?: string, folder?: string, since?: string, until?: string, order?: string, chain?: string, entity?: string, episode?: string }) => Promise<{ path, score, project, tags, date, text }[]>, count: () => Promise<number> }}
63 */
64 export function createSqliteVectorStore(config) {
65 const dataDir = config.data_dir || 'data';
66 const resolvedDir = path.isAbsolute(dataDir) ? dataDir : path.resolve(process.cwd(), dataDir);
67 if (!fs.existsSync(resolvedDir)) {
68 fs.mkdirSync(resolvedDir, { recursive: true });
69 }
70 const dbPath = path.join(resolvedDir, DB_FILENAME);
71 const db = new Database(dbPath);
72 sqliteVec.load(db);
73 const allowDimensionMigration = config?.allow_dimension_migration === true;
74
75 let _dimension = null;
76
77 return {
78 /** Close the database (for tests or cleanup). */
79 close() {
80 db.close();
81 },
82
83 async ensureCollection(dimension) {
84 _dimension = dimension;
85 const tableInfo = db.prepare(
86 "SELECT sql FROM sqlite_master WHERE type='table' AND name=?"
87 ).get(TABLE_NAME);
88 if (tableInfo && tableInfo.sql) {
89 const match = tableInfo.sql.match(/embedding\s+FLOAT\s*\[\s*(\d+)\s*]/i);
90 const existingDim = match ? parseInt(match[1], 10) : null;
91 const dimMismatch = existingDim != null && existingDim !== dimension;
92 if (dimMismatch && !allowDimensionMigration) {
93 throw new Error(
94 `Vector store dimension mismatch: table has ${existingDim}, requested ${dimension}. ` +
95 `Delete ${dbPath} and run "knowtation index" to re-index with the current embedding model.`
96 );
97 }
98 // Phase 15: multi-vault Hub shares one DB; older vec0 tables lacked vault_id — drop and recreate.
99 // Phase 19+ (`feat/bridge-embed-hash-cache`): older tables also lacked `+content_hash`,
100 // which is required for the bridge to skip re-embedding unchanged chunks. vec0 virtual
101 // tables cannot be ALTERed, so we drop + recreate.
102 // Phase 19a (`fix/bridge-vector-store-dim-and-hash-model-binding`): when
103 // `allow_dimension_migration` is set (bridge only), a dimension change also triggers
104 // drop + recreate. Vectors at the old dim are mathematically incompatible with queries
105 // at the new dim — there is no "migrate in place" option, only re-embed.
106 const hasVaultId = /\bvault_id\b/i.test(tableInfo.sql);
107 const hasContentHash = /\bcontent_hash\b/i.test(tableInfo.sql);
108 const needsRecreate = dimMismatch || !hasVaultId || !hasContentHash;
109 if (needsRecreate) {
110 if (dimMismatch) {
111 console.warn(
112 `[vector-store-sqlite] dimension migration: dropping ${TABLE_NAME} ` +
113 `(existing dim ${existingDim} → requested dim ${dimension}). ` +
114 'All vectors for all vaults in this DB will be removed and must be re-embedded. ' +
115 'This is expected after switching EMBEDDING_PROVIDER or EMBEDDING_MODEL.',
116 );
117 }
118 db.exec(`DROP TABLE ${TABLE_NAME}`);
119 } else {
120 return;
121 }
122 }
123 // vec0: id integer primary key, embedding float[dim], metadata columns, +auxiliary.
124 // `+content_hash` is auxiliary (not indexed) — read alongside id for the bridge cache check.
125 const sql = `CREATE VIRTUAL TABLE ${TABLE_NAME} USING vec0(
126 id INTEGER PRIMARY KEY,
127 embedding FLOAT[${dimension}],
128 path TEXT,
129 project TEXT,
130 date TEXT,
131 causal_chain_id TEXT,
132 episode_id TEXT,
133 +vault_id TEXT,
134 +tags TEXT,
135 +entity TEXT,
136 +chunk_text TEXT,
137 +content_hash TEXT,
138 +chunk_id TEXT
139 )`;
140 db.exec(sql);
141 },
142
143 async upsert(points) {
144 if (!points.length) return;
145 // vec0 does not support INSERT OR REPLACE / UPSERT (sqlite-vec #127); delete then insert.
146 const delStmt = db.prepare(`DELETE FROM ${TABLE_NAME} WHERE id = ?`);
147 const run = db.transaction((pts) => {
148 for (const p of pts) {
149 const id = pointIdFromChunkId(p.id);
150 const tagsStr = JSON.stringify(p.tags ?? []);
151 const entityStr = JSON.stringify(p.entity ?? []);
152 // vec0 TEXT metadata columns do not accept NULL; use empty string.
153 const esc = (v) => {
154 if (v == null || v === '') return "''";
155 return "'" + String(v).replace(/'/g, "''").replace(/\\/g, '\\\\') + "'";
156 };
157 const embeddingStr = JSON.stringify(Array.from(p.vector));
158 if (embeddingStr.includes("'")) {
159 throw new Error('Vector cannot contain single quote (invalid embedding).');
160 }
161 delStmt.run(id);
162 const vaultStr = esc(p.vault_id ?? 'default');
163 // `chunk_id` (string) lets the bridge's content-hash cache look up rows without
164 // having to invert the SHA-256-truncated integer id (`pointIdFromChunkId`).
165 const chunkIdStr = esc(p.id);
166 const contentHashStr = esc(p.content_hash);
167 db.exec(
168 `INSERT INTO ${TABLE_NAME} (id, embedding, path, project, date, causal_chain_id, episode_id, vault_id, tags, entity, chunk_text, content_hash, chunk_id) VALUES (${id}, '${embeddingStr}', ${esc(p.path)}, ${esc(p.project)}, ${esc(p.date)}, ${esc(p.causal_chain_id)}, ${esc(p.episode_id)}, ${vaultStr}, ${esc(tagsStr)}, ${esc(entityStr)}, ${esc(p.text)}, ${contentHashStr}, ${chunkIdStr})`
169 );
170 }
171 });
172 run(points);
173 },
174
175 async search(queryVector, options = {}) {
176 const limit = Math.min(options.limit ?? 10, 100);
177 const needsWideFetch =
178 options.folder ||
179 options.since ||
180 options.until ||
181 options.tag ||
182 options.entity ||
183 (options.vault_id != null && options.vault_id !== '');
184 // Multi-vault: KNN is global; a small vault can be absent from top-k unless k is large enough.
185 const vaultScoped = options.vault_id != null && options.vault_id !== '';
186 const pathPrefixFilter = options.folder && String(options.folder).trim() !== '';
187 let fetchLimit = limit;
188 if (pathPrefixFilter) {
189 fetchLimit = Math.min(Math.min(Math.max(limit * 100, 2000), 10000), MAX_VECTOR_KNN);
190 } else if (needsWideFetch) {
191 fetchLimit = Math.min(
192 Math.min(vaultScoped ? Math.max(limit * 25, 400) : limit * 3, vaultScoped ? 2000 : 300),
193 MAX_VECTOR_KNN,
194 );
195 } else {
196 fetchLimit = Math.min(fetchLimit, MAX_VECTOR_KNN);
197 }
198
199 const exists = db.prepare(
200 "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?"
201 ).get(TABLE_NAME);
202 if (!exists) {
203 throw new Error('Vector store collection not found. Run "knowtation index" first to index your vault.');
204 }
205
206 const vecStr = JSON.stringify(Array.from(queryVector));
207 // vec0 KNN: one binding for the vector; k as literal to avoid binding order issues.
208 const stmt = db.prepare(`
209 SELECT id, path, project, date, causal_chain_id, episode_id, vault_id, tags, entity, chunk_text, distance
210 FROM ${TABLE_NAME}
211 WHERE embedding MATCH ? AND k = ${fetchLimit}
212 `);
213
214 let rows;
215 try {
216 rows = stmt.all(vecStr);
217 } catch (e) {
218 if (e.message && e.message.includes('no such table')) {
219 throw new Error('Vector store collection not found. Run "knowtation index" first to index your vault.');
220 }
221 throw e;
222 }
223
224 let hits = (rows || []).map((row) => {
225 let tags = [];
226 try {
227 tags = row.tags ? JSON.parse(row.tags) : [];
228 } catch (_) {}
229 // vec0 returns a distance (lower = more similar). It may be cosine-like in ~[0, 2] or
230 // L2 / other metrics >> 1. Using `max(0, 1 - d)` collapses every L2 hit to score 0.
231 const raw = row.distance ?? row.DISTANCE ?? row.vec_distance;
232 const distance = coerceVecDistance(raw);
233 const score = distance != null ? 1 / (1 + distance) : 0;
234 return {
235 path: row.path ?? '',
236 score,
237 vec_distance: distance,
238 project: row.project ?? null,
239 tags: Array.isArray(tags) ? tags : [],
240 date: row.date ?? null,
241 text: row.chunk_text ?? null,
242 _vault_id: row.vault_id || 'default',
243 _entity: row.entity,
244 _causal_chain_id: row.causal_chain_id,
245 _episode_id: row.episode_id,
246 };
247 });
248
249 const vaultWant = options.vault_id;
250 if (vaultWant != null && vaultWant !== '') {
251 hits = hits.filter((h) => (h._vault_id || 'default') === vaultWant);
252 }
253
254 if (options.project != null && options.project !== '') {
255 hits = hits.filter((h) => h.project === options.project);
256 }
257 if (options.chain != null && options.chain !== '') {
258 hits = hits.filter((h) => h._causal_chain_id === options.chain);
259 }
260 if (options.episode != null && options.episode !== '') {
261 hits = hits.filter((h) => h._episode_id === options.episode);
262 }
263 if (options.tag != null && options.tag !== '') {
264 const tag = options.tag;
265 hits = hits.filter((h) => Array.isArray(h.tags) && h.tags.includes(tag));
266 }
267 if (options.entity != null && options.entity !== '') {
268 const entity = options.entity;
269 hits = hits.filter((h) => {
270 let arr = [];
271 try {
272 arr = h._entity ? JSON.parse(h._entity) : [];
273 } catch (_) {}
274 return Array.isArray(arr) && arr.includes(entity);
275 });
276 }
277 hits.forEach((h) => {
278 delete h._vault_id;
279 delete h._entity;
280 delete h._causal_chain_id;
281 delete h._episode_id;
282 });
283
284 const folder = options.folder;
285 if (folder && typeof folder === 'string') {
286 const prefix = folder.replace(/\\/g, '/').replace(/\/$/, '') + '/';
287 const exact = folder.replace(/\\/g, '/').replace(/\/$/, '');
288 hits = hits.filter((h) => h.path === exact || h.path.startsWith(prefix));
289 }
290 const since = dateToComparable(options.since);
291 const until = dateToComparable(options.until);
292 if (since || until) {
293 hits = hits.filter((h) => {
294 const d = dateToComparable(h.date);
295 if (!d) return false;
296 if (since && d < since) return false;
297 if (until && d > until) return false;
298 return true;
299 });
300 }
301 const order = options.order;
302 if (order === 'date-asc') {
303 hits.sort((a, b) => (a.date || '').localeCompare(b.date || ''));
304 } else if (order === 'date') {
305 hits.sort((a, b) => (b.date || '').localeCompare(a.date || ''));
306 }
307 return hits.slice(0, limit);
308 },
309
310 async count() {
311 const exists = db.prepare(
312 "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?"
313 ).get(TABLE_NAME);
314 if (!exists) return 0;
315 const row = db.prepare(`SELECT COUNT(*) AS c FROM ${TABLE_NAME}`).get();
316 return row?.c ?? 0;
317 },
318
319 /**
320 * Remove all indexed chunks for a Hub vault id (Phase 15 multi-vault).
321 * @param {string} vaultId
322 * @returns {Promise<number>} rows deleted
323 */
324 async deleteByVaultId(vaultId) {
325 if (vaultId == null || vaultId === '') return 0;
326 const vid = String(vaultId).trim();
327 if (!vid) return 0;
328 const exists = db.prepare(
329 "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?"
330 ).get(TABLE_NAME);
331 if (!exists) return 0;
332 const info = db.prepare(`DELETE FROM ${TABLE_NAME} WHERE vault_id = ?`).run(vid);
333 return typeof info.changes === 'number' ? info.changes : 0;
334 },
335
336 /**
337 * Read existing `(chunk_id → content_hash)` pairs for one vault. Used by
338 * `hub/bridge/server.mjs POST /api/v1/index` to skip re-embedding chunks
339 * whose text + metadata are unchanged since the last successful index.
340 *
341 * Rows that pre-date the `+content_hash` column will not appear here (the
342 * `ensureCollection` migration drops + recreates such tables, so the cache
343 * is always either fully populated for current rows or empty after a one-
344 * time migration). Rows with an empty content_hash (theoretically possible
345 * from older buggy writers) are skipped so they get re-embedded.
346 *
347 * @param {string} vaultId
348 * @returns {Promise<Map<string, string>>} chunk_id → content_hash. Empty if collection missing.
349 */
350 async getChunkHashes(vaultId) {
351 if (vaultId == null || vaultId === '') return new Map();
352 const vid = String(vaultId).trim();
353 if (!vid) return new Map();
354 const exists = db.prepare(
355 "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?"
356 ).get(TABLE_NAME);
357 if (!exists) return new Map();
358 const rows = db
359 .prepare(`SELECT chunk_id, content_hash FROM ${TABLE_NAME} WHERE vault_id = ?`)
360 .all(vid);
361 const out = new Map();
362 for (const row of rows || []) {
363 const cid = row?.chunk_id;
364 const ch = row?.content_hash;
365 if (typeof cid !== 'string' || cid === '') continue;
366 if (typeof ch !== 'string' || ch === '') continue;
367 out.set(cid, ch);
368 }
369 return out;
370 },
371
372 /**
373 * Delete rows by their string chunk_id. Used by the bridge incremental
374 * index to remove orphans (chunks that existed in the previous successful
375 * index but are no longer present in the current canister export, e.g.
376 * after a note was deleted or its path was renamed).
377 *
378 * Internally hashes each chunk_id to its integer primary key (same as
379 * `upsert`), so the API surface stays string-based for callers.
380 *
381 * @param {string[]} chunkIds
382 * @returns {Promise<number>} rows deleted
383 */
384 async deleteByChunkIds(chunkIds) {
385 if (!Array.isArray(chunkIds) || chunkIds.length === 0) return 0;
386 const exists = db.prepare(
387 "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?"
388 ).get(TABLE_NAME);
389 if (!exists) return 0;
390 const stmt = db.prepare(`DELETE FROM ${TABLE_NAME} WHERE id = ?`);
391 let total = 0;
392 const run = db.transaction((ids) => {
393 for (const cid of ids) {
394 if (typeof cid !== 'string' || cid === '') continue;
395 const id = pointIdFromChunkId(cid);
396 const info = stmt.run(id);
397 if (info && typeof info.changes === 'number') total += info.changes;
398 }
399 });
400 run(chunkIds);
401 return total;
402 },
403 };
404 }
File History 2 commits
sha256:8d46372e39d2d5a54fd93a8b1c27922fe0d9b22a72197345f1d2c71701cc4ce2 feat(auth): persistent login system + C7 session introspection Human minor 16 days ago