cluster-semantic.mjs
87 lines 2.9 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 1 day ago
1 /**
2 * Semantic clustering over notes (Issue #1 Phase C8).
3 * Embeds truncated note text (up to 200 notes) and runs k-means — works with Qdrant or sqlite-vec without reading raw vectors from the store.
4 */
5
6 import { loadConfig } from './config.mjs';
7 import { listMarkdownFiles, readNote, normalizeSlug } from './vault.mjs';
8 import { embed } from './embedding.mjs';
9 import { kmeans } from './kmeans.mjs';
10
/** Upper bound on how many notes are embedded and clustered in a single run. */
const MAX_NOTES = 200;
/** Number of leading characters of each note body included in the embedded text. */
const TEXT_SLICE = 800;
/** Cluster count used when `n_clusters` is omitted or is not a number. */
const DEFAULT_CLUSTERS = 5;
/** Inclusive bounds for the cluster count; requests outside this range are clamped. */
const MIN_CLUSTERS = 2;
const MAX_CLUSTERS = 15;

/** Replaces backslash separators with forward slashes so paths compare consistently. */
function toPosixPath(p) {
  return p.replace(/\\/g, '/');
}

/** Turns the caller-supplied `n_clusters` into an integer within [MIN_CLUSTERS, MAX_CLUSTERS]. */
function resolveClusterCount(requested) {
  const parsed = Number(requested ?? DEFAULT_CLUSTERS);
  // NaN would make every `length < k` guard false and reach kmeans unchecked.
  if (Number.isNaN(parsed)) return DEFAULT_CLUSTERS;
  // Clamp first (so +/-Infinity land on the bounds), then drop any fractional part.
  return Math.trunc(Math.max(MIN_CLUSTERS, Math.min(parsed, MAX_CLUSTERS)));
}

/** Keeps only the paths equal to `folder` or located underneath it. */
function filterByFolder(paths, folder) {
  const exact = toPosixPath(folder).replace(/\/$/, '');
  const prefix = `${exact}/`;
  return paths.filter((p) => {
    const normalized = toPosixPath(p);
    return normalized === exact || normalized.startsWith(prefix);
  });
}

/** Buckets points by their k-means label in one pass; bucket `c` holds the points labelled `c`. */
function groupByLabel(points, labels, k) {
  const buckets = Array.from({ length: k }, () => []);
  points.forEach((point, i) => {
    const label = labels[i];
    if (Number.isInteger(label) && label >= 0 && label < k) {
      buckets[label].push(point);
    }
  });
  return buckets;
}

/**
 * Embeds a sample of notes and groups them with k-means.
 *
 * At most MAX_NOTES notes are sampled, in the order returned by
 * `listMarkdownFiles`. The embedded text of a note is its frontmatter title
 * (when present) followed by the first TEXT_SLICE characters of its body.
 * Notes that cannot be read, or whose text is blank, are skipped.
 *
 * @param {{ project?: string, folder?: string, n_clusters?: number }} options
 *   - `project`: only notes whose `project` equals the normalized slug are used.
 *   - `folder`: only notes at or below this folder are used (either slash style).
 *   - `n_clusters`: requested cluster count; defaults to 5, is clamped to 2..15
 *     and truncated to an integer. Non-numeric values fall back to the default.
 * @returns {Promise<
 *   { clusters: Array<{ label: string, centroid_snippet: string, paths: string[] }>,
 *     notes_sampled: number, max_notes: number }
 *   | { clusters: [], note: string }
 * >} The second shape is returned when fewer than `k` notes (or embeddings)
 *   are available. `centroid_snippet` is taken from the first member of the
 *   cluster, not from a computed centroid.
 */
export async function runCluster(options = {}) {
  // The default parameter only covers `undefined`; tolerate an explicit null too.
  const opts = options ?? {};
  const config = loadConfig();
  const k = resolveClusterCount(opts.n_clusters);

  let paths = listMarkdownFiles(config.vault_path, { ignore: config.ignore });
  if (opts.folder) {
    paths = filterByFolder(paths, opts.folder);
  }

  const wantProject = opts.project != null ? normalizeSlug(String(opts.project)) : null;

  // Path and text are stored together so they can never get out of step,
  // even when a note turns out to be malformed half-way through processing.
  const samples = [];
  for (const p of paths) {
    if (samples.length >= MAX_NOTES) break;
    try {
      const note = readNote(config.vault_path, p);
      if (wantProject && note.project !== wantProject) continue;
      const title = note.frontmatter?.title ? `${String(note.frontmatter.title)}\n` : '';
      const text = `${title}${(note.body || '').slice(0, TEXT_SLICE)}`;
      if (!text.trim()) continue;
      samples.push({ path: toPosixPath(note.path), text });
    } catch {
      // Best-effort sampling: an unreadable or malformed note is skipped
      // rather than aborting the whole clustering run.
    }
  }

  if (samples.length < k) {
    return {
      clusters: [],
      note: `Not enough notes (${samples.length}) for k=${k}. Add notes or lower n_clusters.`,
    };
  }

  const vectors = await embed(
    samples.map((sample) => sample.text),
    config.embedding || {},
    { voyageInputType: 'document' }
  );

  // Drop notes for which no usable vector came back.
  const points = [];
  samples.forEach((sample, i) => {
    const vector = vectors[i];
    if (!vector || !vector.length) return;
    points.push({ id: sample.path, vector, path: sample.path, text: sample.text });
  });
  if (points.length < k) {
    return { clusters: [], note: 'Embedding failed for some notes.' };
  }

  const { labels } = kmeans(
    points.map(({ id, vector }) => ({ id, vector })),
    k
  );

  const clusters = [];
  groupByLabel(points, labels, k).forEach((members, c) => {
    if (members.length === 0) return;
    clusters.push({
      label: `cluster_${c + 1}`,
      centroid_snippet: (members[0].text || '').slice(0, 120).replace(/\s+/g, ' ').trim(),
      paths: [...new Set(members.map((m) => m.path))],
    });
  });

  return { clusters, notes_sampled: points.length, max_notes: MAX_NOTES };
}
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 1 day ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 1 day ago