indexer.mjs
155 lines 5.7 KB
Raw
sha256:41d741fb345c4abdb640838aa3d847de02ccffd7a39fce04894e743e683b50d0 fix(security): pin patched transitive deps to clear Dependa… Human minor ⚠ breaking 7 days ago
1 /**
2 * Indexer: vault → list notes → chunk → embed → upsert. Idempotent; stable chunk ids. SPEC §5.
3 */
4
5 import { loadConfig } from './config.mjs';
6 import { listMarkdownFiles, readNote } from './vault.mjs';
7 import { chunkNote } from './chunk.mjs';
8 import { embed, embeddingDimension } from './embedding.mjs';
9 import { createVectorStore } from './vector-store.mjs';
10
11 const BATCH_EMBED = 10; // number of chunks sliced off per embed() call in runIndex
12 const BATCH_UPSERT = 50; // number of points sliced off per store.upsert() call in runIndex
13
14 const PROGRESS_ITEM_STEP = 10; // passed as minStep to createThrottledProgress (min progress delta between reports)
15 const PROGRESS_MS = 5000; // passed as minMs to createThrottledProgress (compared against Date.now() deltas, i.e. milliseconds)
16
17 /**
18 * @param {(p: { progress: number, total?: number, message?: string }) => void | Promise<void>} onProgress
19 * @param {number} minStep
20 * @param {number} minMs
21 */
22 function createThrottledProgress(onProgress, minStep, minMs) {
23 let lastT = 0;
24 let lastV = -1e9;
25 return async (v, total, message, force = false) => {
26 const now = Date.now();
27 if (!force && v - lastV < minStep && now - lastT < minMs) return;
28 lastT = now;
29 lastV = v;
30 await onProgress({ progress: v, total, message });
31 };
32 }
33
/**
 * Run a full index pass: load config, list the vault's markdown files, chunk each note,
 * embed the chunks in batches, and upsert them into the vector store.
 *
 * Progress lines go to `options.log` (console.error, i.e. stderr, by default). The
 * configured ignore patterns (`config.ignore`) are passed to the file listing.
 * Notes that fail to read or chunk are logged and skipped; they still count toward
 * `notesProcessed`, which is the number of listed files.
 *
 * @param {{
 *   log?: (msg: string) => void,
 *   vaultId?: string,
 *   vaultPath?: string,
 *   onProgress?: (p: { progress: number, total?: number, message?: string }) => void | Promise<void>
 * }} options - log defaults to console.error; vaultId/vaultPath for multi-vault (hub); onProgress for MCP Phase H (throttled: every 10 items or 5s)
 * @returns {Promise<{ notesProcessed: number, chunksIndexed: number }>}
 * @throws on config/embed/store failure
 */
export async function runIndex(options = {}) {
  const log = options.log || ((msg) => console.error(msg));
  const emit = options.onProgress;
  // One throttle per phase: each keeps its own last-report state.
  const notesProgress = emit ? createThrottledProgress(emit, PROGRESS_ITEM_STEP, PROGRESS_MS) : null;
  const embedProgress = emit ? createThrottledProgress(emit, PROGRESS_ITEM_STEP, PROGRESS_MS) : null;
  const upsertProgress = emit ? createThrottledProgress(emit, PROGRESS_ITEM_STEP, PROGRESS_MS) : null;

  const config = loadConfig();
  const vaultPath = options.vaultPath ?? config.vault_path;
  const vaultId = options.vaultId ?? 'default';
  const storeType = config.vector_store || 'qdrant';
  if (storeType === 'qdrant' && !config.qdrant_url) {
    throw new Error('qdrant_url is required for indexing when using Qdrant. Set in config/local.yaml or QDRANT_URL.');
  }
  if (storeType !== 'qdrant' && storeType !== 'sqlite-vec') {
    throw new Error(`Vector store "${storeType}" is not implemented. Use vector_store: qdrant or sqlite-vec.`);
  }

  const paths = listMarkdownFiles(vaultPath, { ignore: config.ignore });
  log(`Vault: ${vaultPath} (${vaultId}); ${paths.length} note(s) to index.`);

  // Chunking parameters do not depend on the note, so resolve them once.
  const chunkSize = config.indexer?.chunk_size ?? 2048;
  const chunkOverlap = config.indexer?.chunk_overlap ?? 256;

  // Phase 1: read and chunk every note. A failing note is skipped, not fatal.
  const allChunks = [];
  let noteIndex = 0;
  for (const relPath of paths) {
    try {
      const note = readNote(vaultPath, relPath);
      const chunks = chunkNote(note, { chunkSize, chunkOverlap });
      for (const c of chunks) {
        allChunks.push(c);
      }
    } catch (e) {
      // `e` may be any thrown value (including null/undefined or a string);
      // reading `.message` unguarded would throw here and abort the whole run.
      log(`Skip ${relPath}: ${e?.message ?? String(e)}`);
    }
    noteIndex += 1;
    await notesProgress?.(
      noteIndex,
      paths.length,
      `Chunking notes ${noteIndex}/${paths.length}`,
      noteIndex === 1 || noteIndex === paths.length
    );
  }

  if (allChunks.length === 0) {
    log('No chunks to index.');
    // Still make sure the collection exists, even though there is nothing to write.
    const store = await createVectorStore(config);
    const dim = embeddingDimension(config.embedding);
    await store.ensureCollection(dim);
    await embedProgress?.(0, 0, 'No chunks to embed', true);
    await upsertProgress?.(0, 0, 'Nothing to upsert', true);
    return { notesProcessed: paths.length, chunksIndexed: 0 };
  }

  // Phase 2: embed chunk texts in batches; vectors[k] belongs to allChunks[k].
  log(`Embedding ${allChunks.length} chunk(s) with ${config.embedding?.provider || 'ollama'}/${config.embedding?.model || 'nomic-embed-text'}...`);
  const vectors = [];
  for (let i = 0; i < allChunks.length; i += BATCH_EMBED) {
    const batch = allChunks.slice(i, i + BATCH_EMBED);
    const texts = batch.map((c) => c.text);
    const batchVectors = await embed(texts, config.embedding, { voyageInputType: 'document' });
    let missing = 0;
    for (let j = 0; j < batch.length; j++) {
      const vector = batchVectors[j];
      if (!vector) missing += 1;
      // Keep vectors aligned with allChunks even when an entry is missing.
      vectors.push(vector || []);
    }
    if (missing > 0) {
      // Surface the fallback instead of silently indexing empty vectors.
      log(`  warning: embed returned no vector for ${missing} of ${batch.length} chunk(s) in batch at offset ${i}; using empty vector(s)`);
    }
    const done = Math.min(i + BATCH_EMBED, allChunks.length);
    await embedProgress?.(
      done,
      allChunks.length,
      `Embedding chunks ${done}/${allChunks.length}`,
      done === allChunks.length || i === 0
    );
    // The final batch is not logged here.
    if (i + BATCH_EMBED < allChunks.length) {
      log(`  embedded ${done}/${allChunks.length}`);
    }
  }

  const dim = embeddingDimension(config.embedding);
  const store = await createVectorStore(config);
  await store.ensureCollection(dim);

  // Phase 3: upsert points in batches. Point ids are prefixed with the vault id.
  for (let i = 0; i < allChunks.length; i += BATCH_UPSERT) {
    const batch = allChunks.slice(i, i + BATCH_UPSERT);
    const points = batch.map((chunk, j) => ({
      id: `${vaultId}::${chunk.id}`,
      // vectors has exactly one (possibly empty) array per chunk, so no fallback is needed.
      vector: vectors[i + j],
      text: chunk.text,
      path: chunk.path,
      vault_id: vaultId,
      project: chunk.project,
      tags: chunk.tags,
      date: chunk.date,
      causal_chain_id: chunk.causal_chain_id,
      entity: chunk.entity,
      episode_id: chunk.episode_id,
    }));
    await store.upsert(points);
    const done = Math.min(i + BATCH_UPSERT, allChunks.length);
    await upsertProgress?.(
      done,
      allChunks.length,
      `Upserting chunks ${done}/${allChunks.length}`,
      done === allChunks.length || i === 0
    );
    log(`  upserted ${done}/${allChunks.length}`);
  }

  log(`Done. ${paths.length} note(s), ${allChunks.length} chunk(s) indexed.`);
  return { notesProcessed: paths.length, chunksIndexed: allChunks.length };
}
File History 1 commit
sha256:41d741fb345c4abdb640838aa3d847de02ccffd7a39fce04894e743e683b50d0 fix(security): pin patched transitive deps to clear Dependa… Human minor 7 days ago