memory-consolidate.mjs
551 lines 19.4 KB
Raw
sha256:0d530f9ef27b8b75547d1db7701a74bc77b77aa8f3d7fa3a8672cf2af36e63bb reconcile: import GitHub-direct RBAC/OAuth/companion and ho… Human minor ⚠ breaking 6 hours ago
1 /**
2 * Core consolidation engine: reads recent memory events, groups by topic,
3 * sends each group to an LLM for deduplication/merging, and stores the result
4 * as consolidation events. Phase A of the Daemon Consolidation Spec.
5 *
6 * Phase C adds runVerifyPass (Pass 2: Stale Reference Detection).
7 * Phase D adds runDiscoverPass (Pass 3: Relationship Discovery).
8 *
9 * Phase 6 migration (D6.6.2): runDiscoverPass routes insight persistence through
10 * DerivedArtifactWriter. The direct mm.store('insight', ...) call is removed.
11 *
12 * This module is a pure function library with no daemon lifecycle logic.
13 * It can be invoked manually via CLI or MCP.
14 */
15
16 import { extractTopicFromEvent } from './memory-event.mjs';
17 import { createMemoryManager, verifyMemoryEvent } from './memory.mjs';
18 import { completeChat } from './llm-complete.mjs';
19 import { createDerivedArtifactWriter } from './companion-artifact-writer.mjs';
20 import { buildConvenienceProvenance } from './companion-provenance-validator.mjs';
21
/**
 * System prompt sent with every per-topic consolidation LLM call
 * (used as the `system` field by consolidateMemory).
 * It asks the model to merge, de-conflict and distill a topic's events and to
 * answer with a bare JSON array of strings.
 */
const CONSOLIDATION_SYSTEM_PROMPT = [
  'You are a memory consolidation engine for a personal knowledge vault.',
  'You receive a batch of timestamped activity events on a single topic.',
  'Your job:',
  '1. Merge redundant observations into single factual statements.',
  '2. When events contradict each other, keep the most recent fact and discard the older one.',
  '3. Distill the batch into 3-7 concise, factual statements.',
  '4. Each statement must be a complete, standalone fact (no "as mentioned earlier").',
  '5. Preserve note paths and dates when they add context.',
  '',
  'Output format: JSON array of strings, one per fact. No commentary.',
].join('\n');
32
/**
 * Build the user prompt for the consolidation LLM call from one topic group.
 *
 * Each event becomes one line: `[ts] type: <JSON of event.data, first 300 chars>`.
 * When options.encrypt is true (memory.encrypt), the payload is replaced by a
 * fixed placeholder so no event content is placed in the prompt.
 * Events whose data has no JSON representation (missing data, a function)
 * are rendered as `null` instead of throwing.
 * Exported for testing.
 *
 * @param {string} topic
 * @param {object[]} events
 * @param {{ encrypt?: boolean }} [options]
 * @returns {string}
 */
export function buildConsolidationPrompt(topic, events, options = {}) {
  // Optional chaining tolerates an explicit `null` options argument.
  const encrypt = options?.encrypt === true;
  const lines = events.map((e) => {
    if (encrypt) {
      return `[${e.ts}] ${e.type} (event payload omitted — encrypted memory mode)`;
    }
    // JSON.stringify returns undefined (not a string) for undefined/function
    // payloads; fall back to 'null' so one data-less event cannot abort the pass.
    const summary = (JSON.stringify(e.data) ?? 'null').slice(0, 300);
    return `[${e.ts}] ${e.type}: ${summary}`;
  });
  return `Topic: "${topic}"\nEvents (${events.length}):\n${lines.join('\n')}`;
}
54
/**
 * Parse the LLM response into an array of fact strings.
 *
 * Resolution order:
 *   1. Strip a markdown code fence if present, then JSON.parse the text.
 *      A valid JSON array yields its non-empty string items (trimmed);
 *      any other valid JSON value yields [].
 *   2. If the text is not valid JSON, try the span from the first '[' to the
 *      last ']' so prose before/after the array ("Here are the facts: [...]",
 *      "[...] Hope this helps") does not hide it.
 *   3. Otherwise treat the text as a plain list, one fact per line, stripping
 *      a leading bullet ("-", "*", "•") or ordinal ("1.", "2)") marker.
 * Exported for testing.
 *
 * @param {string} raw — raw LLM output
 * @returns {string[]}
 */
export function parseConsolidationResponse(raw) {
  if (!raw || typeof raw !== 'string') return [];

  let cleaned = raw.trim();
  const fenceMatch = cleaned.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (fenceMatch) cleaned = fenceMatch[1].trim();

  const toFacts = (items) => items
    .filter((item) => typeof item === 'string' && item.trim())
    .map((s) => s.trim());

  try {
    const parsed = JSON.parse(cleaned);
    return Array.isArray(parsed) ? toFacts(parsed) : [];
  } catch (_) {
    // Not pure JSON — fall through to the salvage strategies below.
  }

  // Salvage an array embedded in surrounding prose.
  const start = cleaned.indexOf('[');
  const end = cleaned.lastIndexOf(']');
  if (start !== -1 && end > start) {
    try {
      const embedded = JSON.parse(cleaned.slice(start, end + 1));
      if (Array.isArray(embedded)) {
        const facts = toFacts(embedded);
        // Only trust the embedded array when it actually carries facts;
        // something like a "[1]" footnote marker must not suppress the line fallback.
        if (facts.length > 0) return facts;
      }
    } catch (_) {
      // Bracketed span was not JSON either — use the line fallback.
    }
  }

  // Line fallback. The marker must be followed by whitespace so that facts that
  // merely start with a number ("2024 was busy", "3.5 hours") keep their digits.
  return cleaned.split('\n')
    .map((l) => l.replace(/^\s*(?:[-*•]|\d+[.)])\s+/, '').trim())
    .filter((l) => l.length > 0 && !/^[\[\]{}]/.test(l));
}
83
/**
 * System prompt for the relationship discovery LLM call
 * (used as the `system` field by runDiscoverPass).
 * It asks for connections, contradictions and open questions across topic
 * summaries, returned as a JSON object with one array per category.
 */
const DISCOVER_SYSTEM_PROMPT = [
  'You are an insight engine for a personal knowledge vault.',
  "Given topic summaries from the vault's memory, identify:",
  '1. Connections between topics that the user might not have noticed.',
  '2. Contradictions between topics (a fact in one topic conflicts with another).',
  "3. Open questions — things the vault seems to be exploring but hasn't resolved.",
  '',
  'Be concise. Each item should be 1-2 sentences.',
  '',
  'Output format: JSON object with three arrays: "connections", "contradictions", "open_questions".',
].join('\n');
93
/**
 * Render consolidation events into the user prompt for relationship discovery.
 *
 * Every entry contributes one block headed by `Topic: "<name>"`; the topic and
 * facts are read from entry.data when present, otherwise from the entry itself.
 * A non-string topic is rendered as "unknown".
 *
 * With encrypt false, the block lists each fact on its own line (or a
 * "(no facts)" placeholder). With encrypt true, only the topic header is
 * emitted so no fact content is included.
 * Exported for testing.
 *
 * @param {object[]} consolidations — consolidation events (each has data.topic, data.facts)
 * @param {boolean} [encrypt]
 * @returns {string}
 */
export function buildDiscoverPrompt(consolidations, encrypt = false) {
  const renderBlock = (entry) => {
    const payload = entry.data ?? entry;
    const name = typeof payload.topic === 'string' ? payload.topic : 'unknown';
    const header = `Topic: "${name}"`;
    if (encrypt) return header;

    const hasFacts = Array.isArray(payload.facts) && payload.facts.length > 0;
    const body = hasFacts
      ? payload.facts.map((fact) => ` - ${fact}`).join('\n')
      : ' (no facts)';
    return `${header}\n${body}`;
  };

  const sections = consolidations.map((entry) => renderBlock(entry));
  return `Topic summaries:\n${sections.join('\n\n')}`;
}
120
/**
 * Turn raw LLM output into { connections, contradictions, open_questions }.
 *
 * A markdown code fence around the JSON is stripped first. Anything that is
 * not a string, does not parse as JSON, or parses to something other than a
 * plain (non-array) object produces three empty arrays. For a parsed object,
 * each key keeps only its non-blank string items, trimmed; a key that is
 * missing or not an array becomes an empty array. Exported for testing.
 *
 * @param {string} raw — raw LLM output
 * @returns {{ connections: string[], contradictions: string[], open_questions: string[] }}
 */
export function parseDiscoverResponse(raw) {
  const result = { connections: [], contradictions: [], open_questions: [] };
  if (typeof raw !== 'string' || raw === '') return result;

  const trimmed = raw.trim();
  const fenced = /```(?:json)?\s*([\s\S]*?)```/.exec(trimmed);
  const jsonText = fenced ? fenced[1].trim() : trimmed;

  let payload;
  try {
    payload = JSON.parse(jsonText);
  } catch (_) {
    return result;
  }

  const isRecord = payload !== null && typeof payload === 'object' && !Array.isArray(payload);
  if (!isRecord) return result;

  for (const key of Object.keys(result)) {
    const candidate = payload[key];
    if (!Array.isArray(candidate)) continue;
    for (const item of candidate) {
      if (typeof item !== 'string') continue;
      const text = item.trim();
      if (text) result[key].push(text);
    }
  }
  return result;
}
157
/**
 * Bucket events by the topic slug that extractTopicFromEvent returns for each.
 * Buckets appear in first-seen order and keep events in input order.
 * Exported for testing.
 *
 * @param {object[]} events
 * @returns {Map<string, object[]>}
 */
export function groupEventsByTopic(events) {
  const buckets = new Map();
  for (const evt of events) {
    const slug = extractTopicFromEvent(evt);
    const existing = buckets.get(slug);
    if (existing) {
      existing.push(evt);
    } else {
      buckets.set(slug, [evt]);
    }
  }
  return buckets;
}
174
/**
 * Collect the distinct path references carried by a memory event's data payload.
 *
 * data.path is always considered. data.paths (when it is an array) is only
 * considered when encrypt is false. Entries that are not strings, or that are
 * blank after trimming, are dropped; duplicates are removed by exact string
 * match, keeping first-seen order. The returned strings are not trimmed.
 * Exported for testing.
 *
 * @param {object} data — event.data
 * @param {boolean} [encrypt] — if true, skip data.paths (content is opaque)
 * @returns {string[]} unique, non-empty path strings
 */
export function extractPathsFromEventData(data, encrypt = false) {
  if (!data || typeof data !== 'object') return [];

  const includeList = !encrypt && Array.isArray(data.paths);
  const candidates = includeList ? [data.path, ...data.paths] : [data.path];

  const isUsablePath = (value) => typeof value === 'string' && value.trim() !== '';

  // A Set keeps insertion order, so the first occurrence of each path wins.
  return [...new Set(candidates.filter(isUsablePath))];
}
206
/**
 * Decide which passes to run, from the caller's opts.passes value and the
 * daemon config.
 *
 * opts.passes may be:
 *   - string[]        — explicit pass names, e.g. ['consolidate', 'verify']
 *   - string          — comma-separated, e.g. 'consolidate,verify'
 *   - undefined/null  — fall back to daemon config defaults
 *
 * Explicit names are stringified, trimmed, and blank entries dropped.
 * For the config fallback, 'consolidate' and 'verify' are on unless set to
 * false, while 'discover' is only on when set to true.
 *
 * @param {string[]|string|null|undefined} passesOpt
 * @param {object} [daemonPassesCfg] — daemon.passes section from config
 * @returns {string[]}
 */
export function resolvePassNames(passesOpt, daemonPassesCfg) {
  let explicit = null;
  if (Array.isArray(passesOpt)) {
    explicit = passesOpt;
  } else if (typeof passesOpt === 'string') {
    explicit = passesOpt.split(',');
  }
  if (explicit !== null) {
    return explicit.map((part) => String(part).trim()).filter(Boolean);
  }

  const cfg = daemonPassesCfg && typeof daemonPassesCfg === 'object' ? daemonPassesCfg : {};
  const switches = [
    ['consolidate', cfg.consolidate !== false],
    ['verify', cfg.verify !== false],
    ['discover', cfg.discover === true],
  ];
  return switches.filter(([, enabled]) => enabled).map(([name]) => name);
}
234
/**
 * Run Pass 2: Stale Reference Detection.
 *
 * For every event, the path references are extracted with
 * extractPathsFromEventData and each one is checked by calling
 * verifyMemoryEvent on a copy of the event narrowed to that single path.
 * Unless dryRun is true, a 'maintenance' event summarising the outcome is
 * stored through the memory manager.
 *
 * Classification per path (taken from verifyMemoryEvent's `confidence`):
 *   'verified' — added to verified_paths
 *   'stale'    — added to stale_paths
 *   anything else (e.g. 'hint') — not recorded
 * Events without any path reference are not counted in checked_count.
 *
 * NOTE(review): the two result lists are filled independently, so a path that
 * is referenced by several events may appear in both — confirm consumers
 * expect that.
 *
 * @param {object} config — loadConfig() result
 * @param {object[]} events — memory events to scan (already read by caller)
 * @param {{ dryRun?: boolean, mm?: object }} [opts] — mm: memory manager to
 *   store the maintenance event with; created from config when omitted
 * @returns {{ stale_paths: string[], verified_paths: string[], checked_count: number, dry_run: boolean }}
 */
export function runVerifyPass(config, events, opts = {}) {
  const dryRun = opts.dryRun ?? false;
  const encrypt = config.memory?.encrypt === true;

  // One insertion-ordered set per classification we report on.
  const buckets = new Map([
    ['stale', new Set()],
    ['verified', new Set()],
  ]);
  let checked_count = 0;

  for (const event of events) {
    const refs = extractPathsFromEventData(event.data, encrypt);
    if (refs.length === 0) continue;

    checked_count += 1;

    for (const refPath of refs) {
      // Probe event: same event, but data narrowed to this one path and status
      // forced to 'success' so verifyMemoryEvent does the filesystem check
      // rather than short-circuiting.
      const probe = { ...event, status: 'success', data: { path: refPath } };
      const verdict = verifyMemoryEvent(config, probe);
      const bucket = buckets.get(verdict.confidence);
      // No bucket (e.g. 'hint') means the path could not be classified — skip.
      if (bucket) bucket.add(refPath);
    }
  }

  const stale_paths = Array.from(buckets.get('stale'));
  const verified_paths = Array.from(buckets.get('verified'));

  if (!dryRun) {
    const mm = opts.mm ?? createMemoryManager(config);
    mm.store('maintenance', { stale_paths, verified_paths, checked_count });
  }

  return { stale_paths, verified_paths, checked_count, dry_run: dryRun };
}
292
/**
 * Run Pass 3: Relationship Discovery.
 *
 * Builds a topic-summaries prompt from the consolidation events supplied by the
 * caller, asks the LLM for connections / contradictions / open questions, and
 * persists the outcome as an 'insight' artifact.
 *
 * Phase 6 migration (D6.6.2): persistence goes through a DerivedArtifactWriter
 * (opts.writer, or one created here); there is no direct mm.store('insight', ...).
 *
 * When config.memory.encrypt is true, only topic names are sent to the LLM (no facts).
 * When dryRun is true, the function returns before any LLM call or write.
 *
 * NOTE(review): if the LLM call or response parsing throws, the error is
 * swallowed and an artifact with three empty lists is still written — confirm
 * this best-effort behavior is intended.
 *
 * @param {object} config — loadConfig() result
 * @param {object[]} consolidations — consolidation events to discover across
 *   (each must have data.topic and data.facts)
 * @param {{
 *   dryRun?: boolean,
 *   llmFn?: Function,
 *   mm?: import('./memory.mjs').MemoryManager,
 *   writer?: import('./companion-artifact-writer.mjs').DerivedArtifactWriter,
 *   writerContext?: import('./companion-artifact-writer.mjs').WriteContext,
 *   generatedBy?: string,
 *   model?: string,
 *   modelVersion?: string,
 *   runtimeVersion?: string,
 * }} [opts]
 * @returns {Promise<{ connections: string[], contradictions: string[], open_questions: string[], topic_count: number, dry_run: boolean }>}
 */
export async function runDiscoverPass(config, consolidations, opts = {}) {
  const isDryRun = opts.dryRun ?? false;
  const encrypt = config.memory?.encrypt === true;
  const maxTokens = config.daemon?.llm?.max_tokens ?? 1024;
  const llmFn = opts.llmFn || completeChat;
  const topic_count = consolidations.length;

  if (isDryRun) {
    return { connections: [], contradictions: [], open_questions: [], topic_count, dry_run: true };
  }

  const userPrompt = buildDiscoverPrompt(consolidations, encrypt);

  // Ask the LLM; any failure (call or parse) degrades to an empty insight.
  let insight;
  try {
    const request = { system: DISCOVER_SYSTEM_PROMPT, user: userPrompt, maxTokens };
    insight = parseDiscoverResponse(await llmFn(config, request));
  } catch (_) {
    insight = { connections: [], contradictions: [], open_questions: [] };
  }
  const { connections, contradictions, open_questions } = insight;

  // Phase 6 (D6.6.2): persistence is delegated to the DerivedArtifactWriter.
  const mm = opts.mm ?? createMemoryManager(config);

  let writer = opts.writer;
  if (writer == null) {
    // Default writer for the convenience/self-partition case.
    writer = createDerivedArtifactWriter({
      writeNoteFn: () => { /* insight artifacts don't use writeNoteFn */ },
      vaultPath: config.vault_path || '.',
      mm,
      vaultRegistryAvailable: false,
    });
  }

  let context = opts.writerContext;
  if (context == null) {
    context = {
      lane: 'local',
      containsPrivateData: false,
      isDelegate: false,
      delegatedManagedAllowed: false,
      enrichesDelegatedPartition: false,
      delegatedEnrichmentAllowed: false,
    };
  }

  // Provenance source ids: the consolidation events' string ids when any exist,
  // otherwise a single timestamp-based placeholder.
  const knownIds = [];
  for (const entry of consolidations) {
    const candidate = entry.id;
    if (typeof candidate === 'string' && candidate) knownIds.push(candidate);
  }
  const sourceEventId = knownIds.length > 0 ? knownIds : [`discover-${Date.now()}`];

  // D6.2.1: one of model_version|runtime_version MUST be a concrete value, so
  // modelVersion falls back to 'unknown' while runtimeVersion may stay unset.
  const provenance = buildConvenienceProvenance({
    generatedBy: opts.generatedBy || config.vault_id || 'system',
    source: 'companion',
    model: opts.model || config.llm?.model || 'unknown',
    modelVersion: opts.modelVersion || config.llm?.model_version || 'unknown',
    runtimeVersion: opts.runtimeVersion || undefined,
    lane: context.lane,
    artifactType: 'insight',
    sourceNotePath: null,
    sourceEventId,
  });

  await writer.write({ connections, contradictions, open_questions, topic_count }, provenance, context);

  return { connections, contradictions, open_questions, topic_count, dry_run: false };
}
404
/**
 * Run the consolidation engine: read recent events, optionally group by topic
 * and call the LLM (consolidate pass), optionally detect stale path references
 * (verify pass), optionally look for cross-topic relationships (discover pass),
 * store results, and rebuild the pointer index.
 *
 * opts.passes controls which passes run:
 *   - undefined/null  → use daemon config (consolidate + verify by default)
 *   - string[]        → explicit list, e.g. ['consolidate', 'verify']
 *   - comma-string    → e.g. 'consolidate,verify'
 *
 * Consolidate pass details:
 *   - events of type 'consolidation', 'maintenance' and 'insight' are ignored;
 *   - only the maxTopicsPerPass largest topic groups are considered, and groups
 *     with fewer than two events are skipped;
 *   - an LLM failure or an unparseable answer is reported per topic through an
 *     `error` field instead of aborting the run.
 *
 * The discover pass only runs when at least one consolidation was stored in
 * this call; it receives those consolidations together with their stored ids so
 * insight provenance can reference the real source events.
 *
 * Nothing is stored and the index is not rebuilt when dryRun is true.
 *
 * @param {object} config — loadConfig() result
 * @param {{ dryRun?: boolean, passes?: string[]|string, lookbackHours?: number, maxEventsPerPass?: number, maxTopicsPerPass?: number, llmFn?: Function, mm?: object }} [opts]
 *   mm: memory manager to use; created from config when omitted
 * @returns {Promise<{ topics: Array<{ topic: string, event_count: number, facts: string[], id?: string }>, total_events: number, dry_run: boolean, verify: object|null, discover: object|null }>}
 */
export async function consolidateMemory(config, opts = {}) {
  const daemonCfg = config.daemon || {};
  const dryRun = opts.dryRun ?? daemonCfg.dry_run ?? false;
  const lookbackHours = opts.lookbackHours ?? daemonCfg.lookback_hours ?? 24;
  const maxEventsPerPass = opts.maxEventsPerPass ?? daemonCfg.max_events_per_pass ?? 200;
  const maxTopicsPerPass = opts.maxTopicsPerPass ?? daemonCfg.max_topics_per_pass ?? 10;
  const maxTokens = daemonCfg.llm?.max_tokens ?? 1024;

  const llmFn = opts.llmFn || completeChat;
  const consolidateEncrypt = config.memory?.encrypt === true;

  const passNames = resolvePassNames(opts.passes, daemonCfg.passes);
  const runConsolidate = passNames.includes('consolidate');
  const runVerify = passNames.includes('verify');
  const runDiscover = passNames.includes('discover');

  const mm = opts.mm ?? createMemoryManager(config);
  const since = new Date(Date.now() - lookbackHours * 3_600_000).toISOString();
  const allEvents = mm.list({ since, limit: maxEventsPerPass });

  // Derived events written by earlier runs must not be consolidated again.
  const nonConsolidationEvents = allEvents.filter(
    (e) => e.type !== 'consolidation' && e.type !== 'maintenance' && e.type !== 'insight',
  );

  if (nonConsolidationEvents.length === 0) {
    return { topics: [], total_events: 0, dry_run: dryRun, verify: null, discover: null };
  }

  const results = [];

  if (runConsolidate) {
    const topicGroups = groupEventsByTopic(nonConsolidationEvents);

    // Largest groups first, capped at maxTopicsPerPass.
    const sortedTopics = [...topicGroups.entries()]
      .sort((a, b) => b[1].length - a[1].length)
      .slice(0, maxTopicsPerPass);

    for (const [topic, events] of sortedTopics) {
      if (events.length < 2) continue;

      if (dryRun) {
        results.push({
          topic,
          event_count: events.length,
          facts: [],
          dry_run_estimate: `${Math.min(events.length, 7)} facts`,
        });
        continue;
      }

      // Built only once we know an LLM call will actually be made.
      const userPrompt = buildConsolidationPrompt(topic, events, { encrypt: consolidateEncrypt });

      let facts;
      try {
        const rawResponse = await llmFn(config, {
          system: CONSOLIDATION_SYSTEM_PROMPT,
          user: userPrompt,
          maxTokens,
        });
        facts = parseConsolidationResponse(rawResponse);
      } catch (err) {
        results.push({
          topic,
          event_count: events.length,
          facts: [],
          // `err` may be null/undefined or a non-Error value when a promise
          // rejects with one; never let error reporting itself throw.
          error: err?.message || String(err),
        });
        continue;
      }

      if (facts.length === 0) {
        results.push({
          topic,
          event_count: events.length,
          facts: [],
          error: 'LLM returned no parseable facts',
        });
        continue;
      }

      const timestamps = events.map((e) => e.ts).sort();
      const consolidationData = {
        topic,
        facts,
        event_count: events.length,
        since: timestamps[0],
        until: timestamps[timestamps.length - 1],
      };

      const stored = mm.store('consolidation', consolidationData);
      results.push({
        topic,
        event_count: events.length,
        facts,
        id: stored.id,
      });
    }
  }

  let verifyResult = null;
  if (runVerify) {
    verifyResult = runVerifyPass(config, nonConsolidationEvents, { dryRun, mm });
  }

  // Collect consolidation events that were actually stored this pass (non-dry-run, non-error).
  // These are passed to the discover pass so it doesn't re-read from disk.
  const storedConsolidations = results.filter((r) => r.id != null);

  let discoverResult = null;
  if (runDiscover && storedConsolidations.length > 0) {
    // Carry the stored event id along: runDiscoverPass reads `id` from each
    // consolidation to populate provenance and otherwise falls back to a
    // synthetic timestamp-based source id.
    const consolidationEvents = storedConsolidations.map((r) => ({
      id: r.id,
      data: { topic: r.topic, facts: r.facts },
    }));
    discoverResult = await runDiscoverPass(config, consolidationEvents, { dryRun, llmFn, mm });
  }

  if (!dryRun) {
    mm.generateIndex({ force: true });
  }

  return {
    topics: results,
    total_events: nonConsolidationEvents.length,
    dry_run: dryRun,
    verify: verifyResult,
    discover: discoverResult,
  };
}
File History 1 commit
sha256:0d530f9ef27b8b75547d1db7701a74bc77b77aa8f3d7fa3a8672cf2af36e63bb reconcile: import GitHub-direct RBAC/OAuth/companion and ho… Human minor 6 hours ago