vector-store-sqlite.mjs
sha256:8d46372e39d2d5a54fd93a8b1c27922fe0d9b22a72197345f1d2c71701cc4ce2
feat(auth): persistent login system + C7 session introspection
Human
minor
⚠ breaking
16 days ago
| 1 | /** |
| 2 | * sqlite-vec backend for vector store. Phase 10. |
| 3 | * Same interface as Qdrant: ensureCollection(dimension), upsert(points), search(queryVector, options), count(). |
| 4 | * DB path: data_dir/knowtation_vectors.db |
| 5 | */ |
| 6 | |
| 7 | import path from 'path'; |
| 8 | import fs from 'fs'; |
| 9 | import crypto from 'crypto'; |
| 10 | import Database from 'better-sqlite3'; |
| 11 | import * as sqliteVec from 'sqlite-vec'; |
| 12 | import { MAX_VECTOR_KNN } from './vector-knn-limit.mjs'; |
| 13 | |
| 14 | const TABLE_NAME = 'knowtation_vec'; |
| 15 | const DB_FILENAME = 'knowtation_vectors.db'; |
| 16 | |
| 17 | /** |
| 18 | * sqlite-vec / better-sqlite3 may return `distance` as number, bigint, string, or (rarely) Buffer. |
| 19 | * @param {unknown} raw |
| 20 | * @returns {number|null} |
| 21 | */ |
| 22 | function coerceVecDistance(raw) { |
| 23 | if (raw == null) return null; |
| 24 | if (typeof raw === 'number' && Number.isFinite(raw) && raw >= 0) return raw; |
| 25 | if (typeof raw === 'bigint') { |
| 26 | const n = Number(raw); |
| 27 | return Number.isFinite(n) && n >= 0 ? n : null; |
| 28 | } |
| 29 | if (typeof raw === 'string') { |
| 30 | const n = Number(raw); |
| 31 | return Number.isFinite(n) && n >= 0 ? n : null; |
| 32 | } |
| 33 | if (typeof Buffer !== 'undefined' && Buffer.isBuffer(raw) && raw.length === 8) { |
| 34 | const n = raw.readDoubleLE(0); |
| 35 | return Number.isFinite(n) && n >= 0 ? n : null; |
| 36 | } |
| 37 | return null; |
| 38 | } |
| 39 | |
| 40 | function pointIdFromChunkId(chunkId) { |
| 41 | const buf = crypto.createHash('sha256').update(chunkId).digest(); |
| 42 | return buf.readUInt32BE(0); |
| 43 | } |
| 44 | |
| 45 | function dateToComparable(d) { |
| 46 | if (d == null || typeof d !== 'string') return ''; |
| 47 | return d.trim().slice(0, 10) || ''; |
| 48 | } |
| 49 | |
| 50 | /** |
| 51 | * Create sqlite-vec vector store. Uses data_dir for DB path. |
| 52 | * @param {{ |
| 53 | * vector_store: string, |
| 54 | * data_dir: string, |
| 55 | * allow_dimension_migration?: boolean, |
| 56 | * }} config - `allow_dimension_migration: true` lets `ensureCollection` drop + recreate |
| 57 | * the table when the persisted dimension differs from the requested dimension. Bridge |
| 58 | * sets this because it owns the data lifecycle (downloads from blob, indexes, uploads |
| 59 | * to blob); a provider switch (e.g. OpenAI 1536 → DeepInfra 1024) can only resolve via |
| 60 | * a full re-embed. CLI leaves it OFF so an accidental EMBEDDING_PROVIDER swap surfaces |
| 61 | * loudly instead of silently corrupting search. |
| 62 | * @returns {{ ensureCollection: (dimension: number) => Promise<void>, upsert: (points) => Promise<void>, search: (queryVector: number[], options?: { limit?: number, vault_id?: string, project?: string, tag?: string, folder?: string, since?: string, until?: string, order?: string, chain?: string, entity?: string, episode?: string }) => Promise<{ path, score, project, tags, date, text }[]>, count: () => Promise<number> }} |
| 63 | */ |
| 64 | export function createSqliteVectorStore(config) { |
| 65 | const dataDir = config.data_dir || 'data'; |
| 66 | const resolvedDir = path.isAbsolute(dataDir) ? dataDir : path.resolve(process.cwd(), dataDir); |
| 67 | if (!fs.existsSync(resolvedDir)) { |
| 68 | fs.mkdirSync(resolvedDir, { recursive: true }); |
| 69 | } |
| 70 | const dbPath = path.join(resolvedDir, DB_FILENAME); |
| 71 | const db = new Database(dbPath); |
| 72 | sqliteVec.load(db); |
| 73 | const allowDimensionMigration = config?.allow_dimension_migration === true; |
| 74 | |
| 75 | let _dimension = null; |
| 76 | |
| 77 | return { |
| 78 | /** Close the database (for tests or cleanup). */ |
| 79 | close() { |
| 80 | db.close(); |
| 81 | }, |
| 82 | |
| 83 | async ensureCollection(dimension) { |
| 84 | _dimension = dimension; |
| 85 | const tableInfo = db.prepare( |
| 86 | "SELECT sql FROM sqlite_master WHERE type='table' AND name=?" |
| 87 | ).get(TABLE_NAME); |
| 88 | if (tableInfo && tableInfo.sql) { |
| 89 | const match = tableInfo.sql.match(/embedding\s+FLOAT\s*\[\s*(\d+)\s*]/i); |
| 90 | const existingDim = match ? parseInt(match[1], 10) : null; |
| 91 | const dimMismatch = existingDim != null && existingDim !== dimension; |
| 92 | if (dimMismatch && !allowDimensionMigration) { |
| 93 | throw new Error( |
| 94 | `Vector store dimension mismatch: table has ${existingDim}, requested ${dimension}. ` + |
| 95 | `Delete ${dbPath} and run "knowtation index" to re-index with the current embedding model.` |
| 96 | ); |
| 97 | } |
| 98 | // Phase 15: multi-vault Hub shares one DB; older vec0 tables lacked vault_id — drop and recreate. |
| 99 | // Phase 19+ (`feat/bridge-embed-hash-cache`): older tables also lacked `+content_hash`, |
| 100 | // which is required for the bridge to skip re-embedding unchanged chunks. vec0 virtual |
| 101 | // tables cannot be ALTERed, so we drop + recreate. |
| 102 | // Phase 19a (`fix/bridge-vector-store-dim-and-hash-model-binding`): when |
| 103 | // `allow_dimension_migration` is set (bridge only), a dimension change also triggers |
| 104 | // drop + recreate. Vectors at the old dim are mathematically incompatible with queries |
| 105 | // at the new dim — there is no "migrate in place" option, only re-embed. |
| 106 | const hasVaultId = /\bvault_id\b/i.test(tableInfo.sql); |
| 107 | const hasContentHash = /\bcontent_hash\b/i.test(tableInfo.sql); |
| 108 | const needsRecreate = dimMismatch || !hasVaultId || !hasContentHash; |
| 109 | if (needsRecreate) { |
| 110 | if (dimMismatch) { |
| 111 | console.warn( |
| 112 | `[vector-store-sqlite] dimension migration: dropping ${TABLE_NAME} ` + |
| 113 | `(existing dim ${existingDim} → requested dim ${dimension}). ` + |
| 114 | 'All vectors for all vaults in this DB will be removed and must be re-embedded. ' + |
| 115 | 'This is expected after switching EMBEDDING_PROVIDER or EMBEDDING_MODEL.', |
| 116 | ); |
| 117 | } |
| 118 | db.exec(`DROP TABLE ${TABLE_NAME}`); |
| 119 | } else { |
| 120 | return; |
| 121 | } |
| 122 | } |
| 123 | // vec0: id integer primary key, embedding float[dim], metadata columns, +auxiliary. |
| 124 | // `+content_hash` is auxiliary (not indexed) — read alongside id for the bridge cache check. |
| 125 | const sql = `CREATE VIRTUAL TABLE ${TABLE_NAME} USING vec0( |
| 126 | id INTEGER PRIMARY KEY, |
| 127 | embedding FLOAT[${dimension}], |
| 128 | path TEXT, |
| 129 | project TEXT, |
| 130 | date TEXT, |
| 131 | causal_chain_id TEXT, |
| 132 | episode_id TEXT, |
| 133 | +vault_id TEXT, |
| 134 | +tags TEXT, |
| 135 | +entity TEXT, |
| 136 | +chunk_text TEXT, |
| 137 | +content_hash TEXT, |
| 138 | +chunk_id TEXT |
| 139 | )`; |
| 140 | db.exec(sql); |
| 141 | }, |
| 142 | |
| 143 | async upsert(points) { |
| 144 | if (!points.length) return; |
| 145 | // vec0 does not support INSERT OR REPLACE / UPSERT (sqlite-vec #127); delete then insert. |
| 146 | const delStmt = db.prepare(`DELETE FROM ${TABLE_NAME} WHERE id = ?`); |
| 147 | const run = db.transaction((pts) => { |
| 148 | for (const p of pts) { |
| 149 | const id = pointIdFromChunkId(p.id); |
| 150 | const tagsStr = JSON.stringify(p.tags ?? []); |
| 151 | const entityStr = JSON.stringify(p.entity ?? []); |
| 152 | // vec0 TEXT metadata columns do not accept NULL; use empty string. |
| 153 | const esc = (v) => { |
| 154 | if (v == null || v === '') return "''"; |
| 155 | return "'" + String(v).replace(/'/g, "''").replace(/\\/g, '\\\\') + "'"; |
| 156 | }; |
| 157 | const embeddingStr = JSON.stringify(Array.from(p.vector)); |
| 158 | if (embeddingStr.includes("'")) { |
| 159 | throw new Error('Vector cannot contain single quote (invalid embedding).'); |
| 160 | } |
| 161 | delStmt.run(id); |
| 162 | const vaultStr = esc(p.vault_id ?? 'default'); |
| 163 | // `chunk_id` (string) lets the bridge's content-hash cache look up rows without |
| 164 | // having to invert the SHA-256-truncated integer id (`pointIdFromChunkId`). |
| 165 | const chunkIdStr = esc(p.id); |
| 166 | const contentHashStr = esc(p.content_hash); |
| 167 | db.exec( |
| 168 | `INSERT INTO ${TABLE_NAME} (id, embedding, path, project, date, causal_chain_id, episode_id, vault_id, tags, entity, chunk_text, content_hash, chunk_id) VALUES (${id}, '${embeddingStr}', ${esc(p.path)}, ${esc(p.project)}, ${esc(p.date)}, ${esc(p.causal_chain_id)}, ${esc(p.episode_id)}, ${vaultStr}, ${esc(tagsStr)}, ${esc(entityStr)}, ${esc(p.text)}, ${contentHashStr}, ${chunkIdStr})` |
| 169 | ); |
| 170 | } |
| 171 | }); |
| 172 | run(points); |
| 173 | }, |
| 174 | |
| 175 | async search(queryVector, options = {}) { |
| 176 | const limit = Math.min(options.limit ?? 10, 100); |
| 177 | const needsWideFetch = |
| 178 | options.folder || |
| 179 | options.since || |
| 180 | options.until || |
| 181 | options.tag || |
| 182 | options.entity || |
| 183 | (options.vault_id != null && options.vault_id !== ''); |
| 184 | // Multi-vault: KNN is global; a small vault can be absent from top-k unless k is large enough. |
| 185 | const vaultScoped = options.vault_id != null && options.vault_id !== ''; |
| 186 | const pathPrefixFilter = options.folder && String(options.folder).trim() !== ''; |
| 187 | let fetchLimit = limit; |
| 188 | if (pathPrefixFilter) { |
| 189 | fetchLimit = Math.min(Math.min(Math.max(limit * 100, 2000), 10000), MAX_VECTOR_KNN); |
| 190 | } else if (needsWideFetch) { |
| 191 | fetchLimit = Math.min( |
| 192 | Math.min(vaultScoped ? Math.max(limit * 25, 400) : limit * 3, vaultScoped ? 2000 : 300), |
| 193 | MAX_VECTOR_KNN, |
| 194 | ); |
| 195 | } else { |
| 196 | fetchLimit = Math.min(fetchLimit, MAX_VECTOR_KNN); |
| 197 | } |
| 198 | |
| 199 | const exists = db.prepare( |
| 200 | "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?" |
| 201 | ).get(TABLE_NAME); |
| 202 | if (!exists) { |
| 203 | throw new Error('Vector store collection not found. Run "knowtation index" first to index your vault.'); |
| 204 | } |
| 205 | |
| 206 | const vecStr = JSON.stringify(Array.from(queryVector)); |
| 207 | // vec0 KNN: one binding for the vector; k as literal to avoid binding order issues. |
| 208 | const stmt = db.prepare(` |
| 209 | SELECT id, path, project, date, causal_chain_id, episode_id, vault_id, tags, entity, chunk_text, distance |
| 210 | FROM ${TABLE_NAME} |
| 211 | WHERE embedding MATCH ? AND k = ${fetchLimit} |
| 212 | `); |
| 213 | |
| 214 | let rows; |
| 215 | try { |
| 216 | rows = stmt.all(vecStr); |
| 217 | } catch (e) { |
| 218 | if (e.message && e.message.includes('no such table')) { |
| 219 | throw new Error('Vector store collection not found. Run "knowtation index" first to index your vault.'); |
| 220 | } |
| 221 | throw e; |
| 222 | } |
| 223 | |
| 224 | let hits = (rows || []).map((row) => { |
| 225 | let tags = []; |
| 226 | try { |
| 227 | tags = row.tags ? JSON.parse(row.tags) : []; |
| 228 | } catch (_) {} |
| 229 | // vec0 returns a distance (lower = more similar). It may be cosine-like in ~[0, 2] or |
| 230 | // L2 / other metrics >> 1. Using `max(0, 1 - d)` collapses every L2 hit to score 0. |
| 231 | const raw = row.distance ?? row.DISTANCE ?? row.vec_distance; |
| 232 | const distance = coerceVecDistance(raw); |
| 233 | const score = distance != null ? 1 / (1 + distance) : 0; |
| 234 | return { |
| 235 | path: row.path ?? '', |
| 236 | score, |
| 237 | vec_distance: distance, |
| 238 | project: row.project ?? null, |
| 239 | tags: Array.isArray(tags) ? tags : [], |
| 240 | date: row.date ?? null, |
| 241 | text: row.chunk_text ?? null, |
| 242 | _vault_id: row.vault_id || 'default', |
| 243 | _entity: row.entity, |
| 244 | _causal_chain_id: row.causal_chain_id, |
| 245 | _episode_id: row.episode_id, |
| 246 | }; |
| 247 | }); |
| 248 | |
| 249 | const vaultWant = options.vault_id; |
| 250 | if (vaultWant != null && vaultWant !== '') { |
| 251 | hits = hits.filter((h) => (h._vault_id || 'default') === vaultWant); |
| 252 | } |
| 253 | |
| 254 | if (options.project != null && options.project !== '') { |
| 255 | hits = hits.filter((h) => h.project === options.project); |
| 256 | } |
| 257 | if (options.chain != null && options.chain !== '') { |
| 258 | hits = hits.filter((h) => h._causal_chain_id === options.chain); |
| 259 | } |
| 260 | if (options.episode != null && options.episode !== '') { |
| 261 | hits = hits.filter((h) => h._episode_id === options.episode); |
| 262 | } |
| 263 | if (options.tag != null && options.tag !== '') { |
| 264 | const tag = options.tag; |
| 265 | hits = hits.filter((h) => Array.isArray(h.tags) && h.tags.includes(tag)); |
| 266 | } |
| 267 | if (options.entity != null && options.entity !== '') { |
| 268 | const entity = options.entity; |
| 269 | hits = hits.filter((h) => { |
| 270 | let arr = []; |
| 271 | try { |
| 272 | arr = h._entity ? JSON.parse(h._entity) : []; |
| 273 | } catch (_) {} |
| 274 | return Array.isArray(arr) && arr.includes(entity); |
| 275 | }); |
| 276 | } |
| 277 | hits.forEach((h) => { |
| 278 | delete h._vault_id; |
| 279 | delete h._entity; |
| 280 | delete h._causal_chain_id; |
| 281 | delete h._episode_id; |
| 282 | }); |
| 283 | |
| 284 | const folder = options.folder; |
| 285 | if (folder && typeof folder === 'string') { |
| 286 | const prefix = folder.replace(/\\/g, '/').replace(/\/$/, '') + '/'; |
| 287 | const exact = folder.replace(/\\/g, '/').replace(/\/$/, ''); |
| 288 | hits = hits.filter((h) => h.path === exact || h.path.startsWith(prefix)); |
| 289 | } |
| 290 | const since = dateToComparable(options.since); |
| 291 | const until = dateToComparable(options.until); |
| 292 | if (since || until) { |
| 293 | hits = hits.filter((h) => { |
| 294 | const d = dateToComparable(h.date); |
| 295 | if (!d) return false; |
| 296 | if (since && d < since) return false; |
| 297 | if (until && d > until) return false; |
| 298 | return true; |
| 299 | }); |
| 300 | } |
| 301 | const order = options.order; |
| 302 | if (order === 'date-asc') { |
| 303 | hits.sort((a, b) => (a.date || '').localeCompare(b.date || '')); |
| 304 | } else if (order === 'date') { |
| 305 | hits.sort((a, b) => (b.date || '').localeCompare(a.date || '')); |
| 306 | } |
| 307 | return hits.slice(0, limit); |
| 308 | }, |
| 309 | |
| 310 | async count() { |
| 311 | const exists = db.prepare( |
| 312 | "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?" |
| 313 | ).get(TABLE_NAME); |
| 314 | if (!exists) return 0; |
| 315 | const row = db.prepare(`SELECT COUNT(*) AS c FROM ${TABLE_NAME}`).get(); |
| 316 | return row?.c ?? 0; |
| 317 | }, |
| 318 | |
| 319 | /** |
| 320 | * Remove all indexed chunks for a Hub vault id (Phase 15 multi-vault). |
| 321 | * @param {string} vaultId |
| 322 | * @returns {Promise<number>} rows deleted |
| 323 | */ |
| 324 | async deleteByVaultId(vaultId) { |
| 325 | if (vaultId == null || vaultId === '') return 0; |
| 326 | const vid = String(vaultId).trim(); |
| 327 | if (!vid) return 0; |
| 328 | const exists = db.prepare( |
| 329 | "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?" |
| 330 | ).get(TABLE_NAME); |
| 331 | if (!exists) return 0; |
| 332 | const info = db.prepare(`DELETE FROM ${TABLE_NAME} WHERE vault_id = ?`).run(vid); |
| 333 | return typeof info.changes === 'number' ? info.changes : 0; |
| 334 | }, |
| 335 | |
| 336 | /** |
| 337 | * Read existing `(chunk_id → content_hash)` pairs for one vault. Used by |
| 338 | * `hub/bridge/server.mjs POST /api/v1/index` to skip re-embedding chunks |
| 339 | * whose text + metadata are unchanged since the last successful index. |
| 340 | * |
| 341 | * Rows that pre-date the `+content_hash` column will not appear here (the |
| 342 | * `ensureCollection` migration drops + recreates such tables, so the cache |
| 343 | * is always either fully populated for current rows or empty after a one- |
| 344 | * time migration). Rows with an empty content_hash (theoretically possible |
| 345 | * from older buggy writers) are skipped so they get re-embedded. |
| 346 | * |
| 347 | * @param {string} vaultId |
| 348 | * @returns {Promise<Map<string, string>>} chunk_id → content_hash. Empty if collection missing. |
| 349 | */ |
| 350 | async getChunkHashes(vaultId) { |
| 351 | if (vaultId == null || vaultId === '') return new Map(); |
| 352 | const vid = String(vaultId).trim(); |
| 353 | if (!vid) return new Map(); |
| 354 | const exists = db.prepare( |
| 355 | "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?" |
| 356 | ).get(TABLE_NAME); |
| 357 | if (!exists) return new Map(); |
| 358 | const rows = db |
| 359 | .prepare(`SELECT chunk_id, content_hash FROM ${TABLE_NAME} WHERE vault_id = ?`) |
| 360 | .all(vid); |
| 361 | const out = new Map(); |
| 362 | for (const row of rows || []) { |
| 363 | const cid = row?.chunk_id; |
| 364 | const ch = row?.content_hash; |
| 365 | if (typeof cid !== 'string' || cid === '') continue; |
| 366 | if (typeof ch !== 'string' || ch === '') continue; |
| 367 | out.set(cid, ch); |
| 368 | } |
| 369 | return out; |
| 370 | }, |
| 371 | |
| 372 | /** |
| 373 | * Delete rows by their string chunk_id. Used by the bridge incremental |
| 374 | * index to remove orphans (chunks that existed in the previous successful |
| 375 | * index but are no longer present in the current canister export, e.g. |
| 376 | * after a note was deleted or its path was renamed). |
| 377 | * |
| 378 | * Internally hashes each chunk_id to its integer primary key (same as |
| 379 | * `upsert`), so the API surface stays string-based for callers. |
| 380 | * |
| 381 | * @param {string[]} chunkIds |
| 382 | * @returns {Promise<number>} rows deleted |
| 383 | */ |
| 384 | async deleteByChunkIds(chunkIds) { |
| 385 | if (!Array.isArray(chunkIds) || chunkIds.length === 0) return 0; |
| 386 | const exists = db.prepare( |
| 387 | "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?" |
| 388 | ).get(TABLE_NAME); |
| 389 | if (!exists) return 0; |
| 390 | const stmt = db.prepare(`DELETE FROM ${TABLE_NAME} WHERE id = ?`); |
| 391 | let total = 0; |
| 392 | const run = db.transaction((ids) => { |
| 393 | for (const cid of ids) { |
| 394 | if (typeof cid !== 'string' || cid === '') continue; |
| 395 | const id = pointIdFromChunkId(cid); |
| 396 | const info = stmt.run(id); |
| 397 | if (info && typeof info.changes === 'number') total += info.changes; |
| 398 | } |
| 399 | }); |
| 400 | run(chunkIds); |
| 401 | return total; |
| 402 | }, |
| 403 | }; |
| 404 | } |
File History
2 commits
sha256:8d46372e39d2d5a54fd93a8b1c27922fe0d9b22a72197345f1d2c71701cc4ce2
feat(auth): persistent login system + C7 session introspection
Human
minor
⚠
16 days ago
sha256:6a102aafafdfe7e70a24f4e59740200f0ee713ce7915f1b53e9d4ba5ee8b4410
Initial Muse snapshot
Human
48 days ago