indexer.mjs
sha256:41d741fb345c4abdb640838aa3d847de02ccffd7a39fce04894e743e683b50d0
fix(security): pin patched transitive deps to clear Dependa…
Human
minor
⚠ breaking
7 days ago
| 1 | /** |
| 2 | * Indexer: vault → list notes → chunk → embed → upsert. Idempotent; stable chunk ids. SPEC §5. |
| 3 | */ |
| 4 | |
| 5 | import { loadConfig } from './config.mjs'; |
| 6 | import { listMarkdownFiles, readNote } from './vault.mjs'; |
| 7 | import { chunkNote } from './chunk.mjs'; |
| 8 | import { embed, embeddingDimension } from './embedding.mjs'; |
| 9 | import { createVectorStore } from './vector-store.mjs'; |
| 10 | |
// Batch sizes used by runIndex: chunks per embed() call and points per store.upsert() call.
const BATCH_EMBED = 10;
const BATCH_UPSERT = 50;

// Progress throttling passed to createThrottledProgress: minimum item step and minimum interval (ms).
const PROGRESS_ITEM_STEP = 10;
const PROGRESS_MS = 5000;
| 16 | |
| 17 | /** |
| 18 | * @param {(p: { progress: number, total?: number, message?: string }) => void | Promise<void>} onProgress |
| 19 | * @param {number} minStep |
| 20 | * @param {number} minMs |
| 21 | */ |
| 22 | function createThrottledProgress(onProgress, minStep, minMs) { |
| 23 | let lastT = 0; |
| 24 | let lastV = -1e9; |
| 25 | return async (v, total, message, force = false) => { |
| 26 | const now = Date.now(); |
| 27 | if (!force && v - lastV < minStep && now - lastT < minMs) return; |
| 28 | lastT = now; |
| 29 | lastV = v; |
| 30 | await onProgress({ progress: v, total, message }); |
| 31 | }; |
| 32 | } |
| 33 | |
/**
 * Run a full index of one vault: load config, list the vault's markdown files, chunk each note,
 * embed every chunk, then upsert the resulting points into the vector store.
 *
 * Progress lines go to `log`. When `onProgress` is supplied, the chunking, embedding and
 * upserting phases each report through their own throttled reporter. Notes that fail to read
 * or chunk are logged and skipped; they do not abort the run.
 *
 * @param {{
 *   log?: (msg: string) => void,
 *   vaultId?: string,
 *   vaultPath?: string,
 *   onProgress?: (p: { progress: number, total?: number, message?: string }) => void | Promise<void>
 * }} options - log defaults to console.error; vaultId defaults to 'default' and vaultPath to
 *   config.vault_path (multi-vault / hub); onProgress for MCP Phase H (throttled: every 10 items or 5s)
 * @returns {Promise<{ notesProcessed: number, chunksIndexed: number }>} notesProcessed is the number
 *   of listed note paths (skipped notes are still counted); chunksIndexed is the number of chunks upserted
 * @throws on config/embed/store failure, including an embed call that yields no vector for a chunk
 */
export async function runIndex(options = {}) {
  const log = options.log || ((msg) => console.error(msg));
  const emit = options.onProgress;
  // Each phase gets its own throttle state.
  const notesProgress = emit ? createThrottledProgress(emit, PROGRESS_ITEM_STEP, PROGRESS_MS) : null;
  const embedProgress = emit ? createThrottledProgress(emit, PROGRESS_ITEM_STEP, PROGRESS_MS) : null;
  const upsertProgress = emit ? createThrottledProgress(emit, PROGRESS_ITEM_STEP, PROGRESS_MS) : null;

  const config = loadConfig();
  const vaultPath = options.vaultPath ?? config.vault_path;
  const vaultId = options.vaultId ?? 'default';
  const storeType = config.vector_store || 'qdrant';
  if (storeType === 'qdrant' && !config.qdrant_url) {
    throw new Error('qdrant_url is required for indexing when using Qdrant. Set in config/local.yaml or QDRANT_URL.');
  }
  if (storeType !== 'qdrant' && storeType !== 'sqlite-vec') {
    throw new Error(`Vector store "${storeType}" is not implemented. Use vector_store: qdrant or sqlite-vec.`);
  }

  const paths = listMarkdownFiles(vaultPath, { ignore: config.ignore });
  log(`Vault: ${vaultPath} (${vaultId}); ${paths.length} note(s) to index.`);

  // Chunking parameters do not vary per note; read them from config once.
  const chunkSize = config.indexer?.chunk_size ?? 2048;
  const chunkOverlap = config.indexer?.chunk_overlap ?? 256;

  const allChunks = [];
  let noteIndex = 0;
  for (const relPath of paths) {
    try {
      const note = readNote(vaultPath, relPath);
      // A fresh options object per call, as before, in case chunkNote keeps or mutates it.
      const chunks = chunkNote(note, { chunkSize, chunkOverlap });
      for (const c of chunks) {
        allChunks.push(c);
      }
    } catch (e) {
      // Thrown values are not guaranteed to be Error instances; never let the handler itself throw.
      log(`Skip ${relPath}: ${e?.message ?? String(e)}`);
    }
    noteIndex += 1;
    await notesProgress?.(
      noteIndex,
      paths.length,
      `Chunking notes ${noteIndex}/${paths.length}`,
      noteIndex === 1 || noteIndex === paths.length
    );
  }

  if (allChunks.length === 0) {
    log('No chunks to index.');
    // Still make sure the collection exists, even though nothing is written to it.
    const store = await createVectorStore(config);
    const dim = embeddingDimension(config.embedding);
    await store.ensureCollection(dim);
    await embedProgress?.(0, 0, 'No chunks to embed', true);
    await upsertProgress?.(0, 0, 'Nothing to upsert', true);
    return { notesProcessed: paths.length, chunksIndexed: 0 };
  }

  log(`Embedding ${allChunks.length} chunk(s) with ${config.embedding?.provider || 'ollama'}/${config.embedding?.model || 'nomic-embed-text'}...`);
  const vectors = [];
  for (let i = 0; i < allChunks.length; i += BATCH_EMBED) {
    const batch = allChunks.slice(i, i + BATCH_EMBED);
    const texts = batch.map((c) => c.text);
    const batchVectors = await embed(texts, config.embedding, { voyageInputType: 'document' });
    for (let j = 0; j < batch.length; j++) {
      const vector = batchVectors?.[j];
      if (!vector) {
        // Fail fast: substituting an empty vector would only surface later as a store error
        // (or as a point stored without a usable embedding).
        throw new Error(
          `Embedding returned no vector for chunk ${i + j + 1}/${allChunks.length} (id: ${batch[j].id}).`
        );
      }
      vectors.push(vector);
    }
    const done = Math.min(i + BATCH_EMBED, allChunks.length);
    await embedProgress?.(
      done,
      allChunks.length,
      `Embedding chunks ${done}/${allChunks.length}`,
      done === allChunks.length || i === 0
    );
    // The final batch is not logged here.
    if (i + BATCH_EMBED < allChunks.length) {
      log(` embedded ${done}/${allChunks.length}`);
    }
  }

  const dim = embeddingDimension(config.embedding);
  const store = await createVectorStore(config);
  await store.ensureCollection(dim);

  for (let i = 0; i < allChunks.length; i += BATCH_UPSERT) {
    const batch = allChunks.slice(i, i + BATCH_UPSERT);
    const points = batch.map((chunk, j) => ({
      // Point ids are the chunk id prefixed with the vault id.
      id: `${vaultId}::${chunk.id}`,
      // vectors is index-aligned with allChunks; every entry was validated during embedding.
      vector: vectors[i + j],
      text: chunk.text,
      path: chunk.path,
      vault_id: vaultId,
      project: chunk.project,
      tags: chunk.tags,
      date: chunk.date,
      causal_chain_id: chunk.causal_chain_id,
      entity: chunk.entity,
      episode_id: chunk.episode_id,
    }));
    await store.upsert(points);
    const done = Math.min(i + BATCH_UPSERT, allChunks.length);
    await upsertProgress?.(
      done,
      allChunks.length,
      `Upserting chunks ${done}/${allChunks.length}`,
      done === allChunks.length || i === 0
    );
    log(` upserted ${done}/${allChunks.length}`);
  }

  log(`Done. ${paths.length} note(s), ${allChunks.length} chunk(s) indexed.`);
  return { notesProcessed: paths.length, chunksIndexed: allChunks.length };
}
File History
1 commit
sha256:41d741fb345c4abdb640838aa3d847de02ccffd7a39fce04894e743e683b50d0
fix(security): pin patched transitive deps to clear Dependa…
Human
minor
⚠
7 days ago