vector-store.mjs
sha256:8d46372e39d2d5a54fd93a8b1c27922fe0d9b22a72197345f1d2c71701cc4ce2
feat(auth): persistent login system + C7 session introspection
Human
minor
⚠ breaking
16 days ago
| 1 | /** |
| 2 | * Vector store abstraction. Qdrant backend for indexer (Phase 2); search in Phase 3. |
| 3 | * SPEC §5: metadata (path, project, tags, date); stable chunk id for upsert. |
| 4 | */ |
| 5 | |
| 6 | import crypto from 'crypto'; |
| 7 | import { QdrantClient } from '@qdrant/js-client-rest'; |
| 8 | import { MAX_VECTOR_KNN } from './vector-knn-limit.mjs'; |
| 9 | |
| 10 | const COLLECTION_NAME = 'knowtation'; |
| 11 | |
/**
 * Derive the numeric Qdrant point id for a chunk.
 *
 * The id is the first four bytes of SHA-256(chunkId), read as an unsigned
 * big-endian 32-bit integer, so the same chunk id always yields the same point
 * id and a repeated upsert overwrites the existing point instead of adding one.
 *
 * NOTE: only 32 bits of the digest are kept, so two different chunk ids can
 * map to the same point id.
 *
 * @param {string} chunkId - stable chunk identifier
 * @returns {number} integer in the range 0 .. 2^32 - 1
 */
function pointIdFromChunkId(chunkId) {
  const hasher = crypto.createHash('sha256');
  hasher.update(chunkId);
  const digest = hasher.digest();
  return digest.readUInt32BE(0);
}
| 19 | |
/**
 * Reduce a date string to its leading `YYYY-MM-DD` part so two dates can be
 * compared as plain strings.
 *
 * The value is trimmed and cut to its first 10 characters; it is not parsed or
 * validated as a date. Anything that is not a string yields `''`.
 *
 * @param {string} d - ISO 8601 or YYYY-MM-DD
 * @returns {string} at most 10 characters, or '' when there is nothing usable
 */
function dateToComparable(d) {
  return typeof d === 'string' ? d.trim().slice(0, 10) : '';
}
| 30 | |
/**
 * Translate search options into a Qdrant payload filter.
 *
 * Every option that is neither null/undefined nor the empty string becomes one
 * `must` condition, emitted in this order: vault_id, project, tag, chain,
 * entity, episode. `tag` and `entity` use an `any` match against the `tags` /
 * `entity` payload fields; the others use an exact `value` match.
 *
 * `since` / `until` are accepted but produce no condition here (the date range
 * is meant to be applied as a post-filter on the string payload), and folder
 * (path prefix) filtering is likewise not part of this filter.
 *
 * @param {{ project?: string, tag?: string, since?: string, until?: string, chain?: string, entity?: string, episode?: string, vault_id?: string }} filters
 * @returns {{ must?: object[] }} `{ must: [...] }`, or an empty object when no option is set
 */
function buildFilter(filters) {
  // [option name, payload key, match operator]
  const rules = [
    ['vault_id', 'vault_id', 'value'],
    ['project', 'project', 'value'],
    ['tag', 'tags', 'any'],
    ['chain', 'causal_chain_id', 'value'],
    ['entity', 'entity', 'any'],
    ['episode', 'episode_id', 'value'],
  ];

  const must = [];
  for (const [option, key, operator] of rules) {
    const wanted = filters[option];
    if (wanted == null || wanted === '') continue;
    const match = operator === 'any' ? { any: [wanted] } : { value: wanted };
    must.push({ key, match });
  }

  if (must.length === 0) return {};
  return { must };
}
| 60 | |
/**
 * Decide whether an error thrown by the Qdrant client means "not found"
 * (typically: the collection has not been created yet).
 *
 * NOTE(review): depending on the client version the thrown error may carry only
 * the HTTP status text ("Not Found") as its message, so the numeric `status`
 * is checked as well and the text match is case-insensitive — confirm against
 * the installed @qdrant/js-client-rest.
 *
 * @param {unknown} err
 * @returns {boolean}
 */
function isNotFoundError(err) {
  if (err == null) return false;
  if (err.status === 404) return true;
  const message = typeof err.message === 'string' ? err.message : '';
  return /not found/i.test(message) || message.includes('404');
}

/**
 * Create vector store (Qdrant or sqlite-vec). Validates config.
 *
 * `vector_store` defaults to 'qdrant'. For 'sqlite-vec' the sqlite backend
 * module is loaded lazily and its store is returned as-is. For 'qdrant' a
 * `qdrant_url` string is required; trailing slashes are removed before the
 * client is constructed.
 *
 * @param {{ qdrant_url?: string, vector_store?: string, data_dir?: string }} config
 * @returns {Promise<{
 *   ensureCollection: (dimension: number) => Promise<void>,
 *   upsert: (points: object[]) => Promise<void>,
 *   search: (queryVector: number[], options?: object) => Promise<{ path: string, score: number, project: string|null, tags: string[], date: string|null, text: string|null }[]>,
 *   count: () => Promise<number>,
 *   deleteByVaultId: (vaultId: string) => Promise<number>,
 *   getChunkHashes: (vaultId: string) => Promise<Map<string, string>>,
 *   deleteByChunkIds: (chunkIds: string[]) => Promise<number>,
 * }>} the Qdrant-backed store (or whatever the sqlite backend returns)
 * @throws {Error} when `vector_store` is unknown or `qdrant_url` is missing / not a string
 */
export async function createVectorStore(config) {
  const store = config.vector_store || 'qdrant';
  if (store === 'sqlite-vec') {
    const { createSqliteVectorStore } = await import('./vector-store-sqlite.mjs');
    return createSqliteVectorStore(config);
  }
  if (store !== 'qdrant') {
    throw new Error(`Vector store "${store}" is not implemented. Use vector_store: qdrant or sqlite-vec.`);
  }
  const url = config.qdrant_url;
  if (!url || typeof url !== 'string') {
    throw new Error('Qdrant requires qdrant_url in config or QDRANT_URL env.');
  }

  const client = new QdrantClient({ url: url.replace(/\/+$/, '') });

  return {
    /**
     * Make sure the collection exists, creating it with cosine distance when
     * it is missing. Errors other than "not found" (network, auth, ...) are
     * rethrown instead of being mistaken for a missing collection.
     * @param {number} dimension - vector size used when the collection is created
     * @returns {Promise<void>}
     */
    async ensureCollection(dimension) {
      try {
        await client.getCollection(COLLECTION_NAME);
        return;
      } catch (err) {
        // Only a missing collection justifies creating one.
        if (!isNotFoundError(err)) throw err;
      }
      await client.createCollection(COLLECTION_NAME, {
        vectors: {
          size: dimension,
          distance: 'Cosine',
        },
      });
    },

    /**
     * Insert or overwrite points. The Qdrant point id is derived from the
     * chunk id, so re-sending the same chunk replaces it. Waits for the write.
     * @param {object[]} points - items with id, vector, path and optional metadata
     * @returns {Promise<void>}
     */
    async upsert(points) {
      if (!points.length) return;
      const payload = points.map((p) => ({
        id: pointIdFromChunkId(p.id),
        vector: p.vector,
        payload: {
          chunk_id: p.id,
          path: p.path,
          vault_id: p.vault_id ?? 'default',
          project: p.project ?? null,
          tags: p.tags ?? [],
          date: p.date ?? null,
          text: p.text ?? null,
          causal_chain_id: p.causal_chain_id ?? null,
          entity: p.entity ?? [],
          episode_id: p.episode_id ?? null,
          // `content_hash` is the bridge-side cache key for skip-re-embed; mirror the
          // sqlite-vec backend so both stores expose the same getChunkHashes surface.
          content_hash: p.content_hash ?? null,
        },
      }));
      await client.upsert(COLLECTION_NAME, {
        wait: true,
        points: payload,
      });
    },

    /**
     * Search by vector. Returns scored points with payload (path, project, tags, date, text).
     * Folder (path prefix) and since/until are applied after the KNN query, so
     * the query asks for more candidates than `limit` whenever one of them is set.
     * `limit` defaults to 10, is capped at 100, and a limit below 1 returns [].
     * @param {number[]} queryVector
     * @param {{ limit?: number, vault_id?: string, project?: string, tag?: string, folder?: string, since?: string, until?: string, chain?: string, entity?: string, episode?: string, order?: string }} options
     * @returns {Promise<{ path: string, score: number, project: string|null, tags: string[], date: string|null, text: string|null }[]>}
     * @throws {Error} with an "index first" hint when the collection does not exist
     */
    async search(queryVector, options = {}) {
      const requested = Number(options.limit ?? 10);
      const limit = Number.isFinite(requested) ? Math.min(Math.trunc(requested), 100) : 10;
      if (limit < 1) return [];

      // Normalise the folder once so the KNN widening and the post-filter agree.
      const folder =
        typeof options.folder === 'string'
          ? options.folder.trim().replace(/\\/g, '/').replace(/\/+$/, '')
          : '';
      const since = dateToComparable(options.since);
      const until = dateToComparable(options.until);

      const filter = buildFilter({
        vault_id: options.vault_id ?? undefined,
        project: options.project ?? undefined,
        tag: options.tag ?? undefined,
        since: options.since ?? undefined,
        until: options.until ?? undefined,
        chain: options.chain ?? undefined,
        entity: options.entity ?? undefined,
        episode: options.episode ?? undefined,
      });

      // Post-filters discard candidates, so over-fetch when one is active.
      let candidateCount = limit;
      if (folder) {
        candidateCount = Math.min(Math.max(limit * 100, 2000), 10000);
      } else if (since || until) {
        candidateCount = Math.min(limit * 3, 300);
      }
      const knnLimit = Math.min(candidateCount, MAX_VECTOR_KNN);

      let scored;
      try {
        scored = await client.search(COLLECTION_NAME, {
          vector: queryVector,
          limit: knnLimit,
          filter: Object.keys(filter).length ? filter : undefined,
          with_payload: true,
          with_vector: false,
        });
      } catch (err) {
        if (isNotFoundError(err)) {
          throw new Error(
            'Vector store collection not found. Run "knowtation index" first to index your vault.',
            { cause: err },
          );
        }
        throw err;
      }

      // Accept either a bare array of points or an object wrapping `points`.
      let scoredList = [];
      if (Array.isArray(scored)) {
        scoredList = scored;
      } else if (scored && typeof scored === 'object' && Array.isArray(scored.points)) {
        scoredList = scored.points;
      }

      let hits = scoredList.map((point) => {
        const pl = point.payload || {};
        const rawScore = point.score;
        let score = typeof rawScore === 'number' && Number.isFinite(rawScore) ? rawScore : Number(rawScore);
        if (!Number.isFinite(score)) score = 0;
        return {
          path: pl.path ?? '',
          score,
          project: pl.project ?? null,
          tags: Array.isArray(pl.tags) ? pl.tags : [],
          date: pl.date ?? null,
          text: pl.text ?? null,
        };
      });

      if (folder) {
        const prefix = `${folder}/`;
        hits = hits.filter((h) => h.path === folder || h.path.startsWith(prefix));
      }

      if (since || until) {
        hits = hits.filter((h) => {
          const day = dateToComparable(h.date);
          if (!day) return false;
          if (since && day < since) return false;
          if (until && day > until) return false;
          return true;
        });
      }

      // Non-string dates sort as '' rather than throwing in localeCompare.
      const dateKey = (hit) => (typeof hit.date === 'string' ? hit.date : '');
      if (options.order === 'date-asc') {
        hits.sort((a, b) => dateKey(a).localeCompare(dateKey(b)));
      } else if (options.order === 'date') {
        hits.sort((a, b) => dateKey(b).localeCompare(dateKey(a)));
      }

      return hits.slice(0, limit);
    },

    /**
     * Return total point count for the collection (for count-only or existence check).
     * A missing collection counts as 0.
     * @returns {Promise<number>}
     */
    async count() {
      try {
        const result = await client.count(COLLECTION_NAME);
        return result?.count ?? 0;
      } catch (err) {
        if (isNotFoundError(err)) return 0;
        throw err;
      }
    },

    /**
     * Remove all points tagged with this Hub vault_id (Phase 15 multi-vault).
     * @param {string} vaultId
     * @returns {Promise<number>} 1 if delete ran; 0 if the id is blank or the collection is missing
     */
    async deleteByVaultId(vaultId) {
      const vid = vaultId == null ? '' : String(vaultId).trim();
      if (!vid) return 0;
      const filter = buildFilter({ vault_id: vid });
      // Never send a delete without a vault condition.
      if (!filter.must || filter.must.length === 0) return 0;
      try {
        await client.delete(COLLECTION_NAME, { wait: true, filter });
        return 1;
      } catch (err) {
        if (isNotFoundError(err)) return 0;
        throw err;
      }
    },

    /**
     * Mirror of `vector-store-sqlite.getChunkHashes`: returns Map<chunk_id, content_hash>
     * for the named vault. Implemented via Qdrant `scroll` (paginated) since `count` does
     * not return payload. A null/empty content_hash is treated as cache miss and skipped.
     * A missing collection yields whatever was collected so far (normally an empty Map).
     *
     * @param {string} vaultId
     * @returns {Promise<Map<string, string>>}
     */
    async getChunkHashes(vaultId) {
      const hashes = new Map();
      const vid = vaultId == null ? '' : String(vaultId).trim();
      if (!vid) return hashes;
      const filter = buildFilter({ vault_id: vid });
      if (!filter.must || filter.must.length === 0) return hashes;

      const PAGE_SIZE = 256;
      // Bound iterations defensively (vault can have huge note count, but a runaway
      // server would block the index handler past Netlify's sync cap).
      const MAX_PAGES = 4096;
      try {
        let offset;
        for (let pageNo = 0; pageNo < MAX_PAGES; pageNo++) {
          const page = await client.scroll(COLLECTION_NAME, {
            filter,
            with_payload: true,
            with_vector: false,
            limit: PAGE_SIZE,
            offset,
          });
          const points = Array.isArray(page?.points) ? page.points : [];
          for (const point of points) {
            const chunkId = point?.payload?.chunk_id;
            const contentHash = point?.payload?.content_hash;
            if (typeof chunkId !== 'string' || chunkId === '') continue;
            if (typeof contentHash !== 'string' || contentHash === '') continue;
            hashes.set(chunkId, contentHash);
          }
          offset = page?.next_page_offset;
          if (!offset || points.length === 0) break;
        }
      } catch (err) {
        if (isNotFoundError(err)) return hashes;
        throw err;
      }
      return hashes;
    },

    /**
     * Mirror of `vector-store-sqlite.deleteByChunkIds`: takes string chunk_ids and
     * deletes the corresponding integer points. Used by the bridge to remove orphans
     * (chunks present in the previous index but absent from the current export).
     * Entries that are not non-empty strings are ignored.
     *
     * @param {string[]} chunkIds
     * @returns {Promise<number>} count requested for delete (Qdrant does not return per-id success)
     */
    async deleteByChunkIds(chunkIds) {
      if (!Array.isArray(chunkIds) || chunkIds.length === 0) return 0;
      const ids = chunkIds
        .filter((cid) => typeof cid === 'string' && cid !== '')
        .map((cid) => pointIdFromChunkId(cid));
      if (ids.length === 0) return 0;
      try {
        await client.delete(COLLECTION_NAME, { wait: true, points: ids });
        return ids.length;
      } catch (err) {
        if (isNotFoundError(err)) return 0;
        throw err;
      }
    },
  };
}
| 326 | |
| 327 | export { COLLECTION_NAME }; |
File History
2 commits
sha256:8d46372e39d2d5a54fd93a8b1c27922fe0d9b22a72197345f1d2c71701cc4ce2
feat(auth): persistent login system + C7 session introspection
Human
minor
⚠
16 days ago
sha256:6a102aafafdfe7e70a24f4e59740200f0ee713ce7915f1b53e9d4ba5ee8b4410
Initial Muse snapshot
Human
48 days ago