gabriel / musehub public
test_mist_phase3_snapshot_indexer.py python
490 lines 17.8 KB
Raw
sha256:7d6dd8f4a89e2d1fef2d84f6e65feaff51385d382f466766b7f690a22ec18e32 fix: fall back to DB ancestry check when mpack-only fast-fo… Sonnet 4.6 patch 6 days ago
1 """Phase 3 TDD: Mist snapshot indexer — symbol anchor extraction on push.
2
3 Tests are written RED first. Run before touching musehub_mist_indexer.py
4 and musehub_intel_providers.py to confirm they fail, then implement to green.
5
6 The indexer reads a mist repo's HEAD commit snapshot manifest, loads each
7 artifact's bytes from the object store, extracts symbol anchors, and writes
8 normalized rows to:
9 musehub_symbol_history_entries — one row per (repo_id, address, commit_id)
10 musehub_symbol_intel — one row per (repo_id, address)
11
12 This makes mist anchors discoverable via muse code grep / code impact across
13 the entire hub, using the same infrastructure as code-domain symbols.
14
15 Idempotency: indexing the same commit twice must produce the same row count.
16 """
17 from __future__ import annotations
18
19 import secrets
20 from datetime import datetime, timezone
21
22 import msgpack
23 import pytest
24 from muse.core.types import blob_id
25 from sqlalchemy import func, select
26 from sqlalchemy.ext.asyncio import AsyncSession
27
28 from musehub.core.genesis import compute_identity_id, compute_repo_id
29 from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry, MusehubSymbolIntel
30 from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubObject, MusehubRepo, MusehubSnapshot, MusehubSnapshotRef
31 from musehub.types.json_types import StrDict
32
33
34 # ---------------------------------------------------------------------------
35 # Helpers
36 # ---------------------------------------------------------------------------
37
def _now() -> datetime:
    """Return the current moment as a timezone-aware UTC datetime."""
    utc = timezone.utc
    return datetime.now(tz=utc)
40
41
def _oid(content: bytes) -> str:
    """Return the object id that ``blob_id`` computes for *content*."""
    object_id = blob_id(content)
    return object_id
44
45
def _manifest_blob(manifest: StrDict) -> bytes:
    """Serialize *manifest* with msgpack (``use_bin_type=True``)."""
    packed = msgpack.packb(manifest, use_bin_type=True)
    return packed
48
49
def _commit_id() -> str:
    """Return a fresh commit id: ``blob_id`` of 16 random bytes."""
    entropy = secrets.token_bytes(16)
    return blob_id(entropy)
52
53
def _snap_id(manifest: StrDict) -> str:
    """Return a snapshot id derived from *manifest*.

    The items are sorted before packing, so the resulting id does not depend
    on the insertion order of the mapping.
    """
    ordered_items = sorted(manifest.items())
    payload = msgpack.packb(ordered_items, use_bin_type=True)
    return blob_id(payload)
56
57
async def _seed_mist_vcs_repo(
    session: AsyncSession,
    *,
    owner: str = "testuser",
    artifacts: dict[str, bytes],  # filename → raw content bytes
) -> tuple[MusehubRepo, MusehubCommit]:
    """Seed a mist-domain repo whose single commit snapshots *artifacts*.

    Rows are written in this order: the repo, one MusehubObject per distinct
    content hash (with ``content_cache`` holding the raw bytes so
    read_object_bytes() can serve it without hitting disk or S3), the
    snapshot and its repo ref, and finally the commit and its repo ref.

    Args:
        session: Async session used for every write; committed before return.
        owner: Handle stored as the repo owner and the commit author.
        artifacts: Mapping of filename to raw file content.

    Returns:
        The refreshed ``(repo, commit)`` pair.
    """
    identity = compute_identity_id(owner.encode())
    repo_slug = f"mist-{secrets.token_hex(4)}"
    born_at = _now()
    repo_id = compute_repo_id(identity, repo_slug, "mist", born_at.isoformat())

    repo = MusehubRepo(
        repo_id=repo_id,
        name=repo_slug,
        slug=repo_slug,
        owner=owner,
        owner_user_id=identity,
        domain_id="mist",
        visibility="public",
        description="",
        tags=[],
        created_at=born_at,
    )
    session.add(repo)
    await session.flush()

    # One object row per distinct content hash; the manifest still lists
    # every filename, even when two files share the same bytes.
    manifest: dict[str, str] = {}
    for filename, raw in artifacts.items():
        digest = _oid(raw)
        manifest[filename] = digest
        # Look the id up first and only add a row when none exists yet.
        if await session.get(MusehubObject, digest) is not None:
            continue
        session.add(
            MusehubObject(
                object_id=digest,
                path=filename,
                size_bytes=len(raw),
                content_cache=raw,
            )
        )
    await session.flush()

    # Snapshot row (reused when one with this id already exists) plus the
    # ref tying it to this repo.
    snap_id = _snap_id(manifest)
    if await session.get(MusehubSnapshot, snap_id) is None:
        snapshot = MusehubSnapshot(
            snapshot_id=snap_id,
            entry_count=len(manifest),
            manifest_blob=_manifest_blob(manifest),
        )
        session.add(snapshot)
    session.add(MusehubSnapshotRef(repo_id=repo_id, snapshot_id=snap_id))
    await session.flush()

    # Root commit (no parents) on "main" pointing at the snapshot.
    cid = _commit_id()
    commit = MusehubCommit(
        commit_id=cid,
        message="initial mist",
        author=owner,
        branch="main",
        parent_ids=[],
        snapshot_id=snap_id,
        timestamp=_now(),
    )
    session.add(commit)
    session.add(MusehubCommitRef(repo_id=repo_id, commit_id=cid))
    await session.commit()

    # Reload both rows after the commit before handing them to the caller.
    for row in (repo, commit):
        await session.refresh(row)
    return repo, commit
135
136
137 # ---------------------------------------------------------------------------
138 # 1. build_mist_anchor_index exists and is importable
139 # ---------------------------------------------------------------------------
140
class TestBuildMistAnchorIndexExists:
    """Smoke checks: the indexer entry point exists and has the expected shape."""

    def test_function_is_importable(self) -> None:
        """``build_mist_anchor_index`` imports and is a coroutine function."""
        import inspect
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        is_async = inspect.iscoroutinefunction(build_mist_anchor_index)
        assert is_async

    def test_function_signature(self) -> None:
        """The signature exposes ``repo_id`` and ``head_commit_id`` parameters."""
        import inspect
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        params = inspect.signature(build_mist_anchor_index).parameters
        for required in ("repo_id", "head_commit_id"):
            assert required in params
153
154
155 # ---------------------------------------------------------------------------
156 # 2. Anchor extraction → musehub_symbol_history_entries
157 # ---------------------------------------------------------------------------
158
class TestMistAnchorIndexerHistoryEntries:
    """Indexing a commit writes MusehubSymbolHistoryEntry rows for its anchors."""

    # NOTE(review): the ``db`` name used in the ``test_user`` annotations is not
    # imported anywhere in this file; it is only harmless because annotations
    # are lazy under ``from __future__ import annotations``. Confirm the
    # intended import.

    @pytest.mark.asyncio
    async def test_python_artifact_writes_history_entries(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """Both functions defined in a Python artifact show up as addresses."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        source = b"def add(a, b):\n return a + b\n\ndef sub(a, b):\n return a - b\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"utils.py": source},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
        )
        result = await db_session.execute(stmt)
        addresses = {entry.address for entry in result.scalars().all()}

        assert any("add" in a for a in addresses), f"Expected 'add' anchor; got {addresses}"
        assert any("sub" in a for a in addresses), f"Expected 'sub' anchor; got {addresses}"

    @pytest.mark.asyncio
    async def test_history_entry_fields(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """A history row carries the commit id, author, op and commit time."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        content = b"def process(x):\n return x\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"module.py": content},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        # Any entry whose address starts with the artifact's filename prefix.
        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
            MusehubSymbolHistoryEntry.address.like("module.py::%"),
        )
        result = await db_session.execute(stmt)
        row = result.scalars().first()

        assert row is not None
        assert row.commit_id == commit.commit_id
        assert row.author == test_user.handle
        assert row.op in ("add", "modify")
        assert row.committed_at is not None

    @pytest.mark.asyncio
    async def test_multiple_artifacts_all_indexed(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """Anchors from every artifact in the snapshot are indexed."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        files = {
            "a.py": b"def alpha(): pass\n",
            "b.py": b"def beta(): pass\n",
        }
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts=files,
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
        )
        result = await db_session.execute(stmt)
        addresses = {entry.address for entry in result.scalars().all()}

        assert any("alpha" in a for a in addresses)
        assert any("beta" in a for a in addresses)

    @pytest.mark.asyncio
    async def test_binary_artifact_produces_no_history_entries(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """A PNG-looking payload must not yield any history rows."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"image.png": png_bytes},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        count_stmt = (
            select(func.count())
            .select_from(MusehubSymbolHistoryEntry)
            .where(MusehubSymbolHistoryEntry.repo_id == repo.repo_id)
        )
        count = (await db_session.execute(count_stmt)).scalar_one()

        assert count == 0
266
267
268 # ---------------------------------------------------------------------------
269 # 3. Anchor extraction → musehub_symbol_intel
270 # ---------------------------------------------------------------------------
271
class TestMistAnchorIndexerSymbolIntel:
    """Indexing a commit writes MusehubSymbolIntel rows for its anchors."""

    @pytest.mark.asyncio
    async def test_python_artifact_writes_symbol_intel(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """A function in a Python artifact gets at least one intel row."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        source = b"def mul(a, b):\n return a * b\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"calc.py": source},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        stmt = select(MusehubSymbolIntel).where(
            MusehubSymbolIntel.repo_id == repo.repo_id,
        )
        result = await db_session.execute(stmt)
        rows = result.scalars().all()

        assert len(rows) >= 1
        addresses = {intel.address for intel in rows}
        assert any("mul" in a for a in addresses)

    @pytest.mark.asyncio
    async def test_symbol_intel_churn_is_at_least_one(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """The intel row for an indexed anchor reports ``churn >= 1``."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        source = b"async def fetch(url):\n pass\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"api.py": source},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        # Any intel row whose address starts with the artifact's filename prefix.
        stmt = select(MusehubSymbolIntel).where(
            MusehubSymbolIntel.repo_id == repo.repo_id,
            MusehubSymbolIntel.address.like("api.py::%"),
        )
        result = await db_session.execute(stmt)
        row = result.scalars().first()

        assert row is not None
        assert row.churn >= 1
322
323
324 # ---------------------------------------------------------------------------
325 # 4. Idempotency
326 # ---------------------------------------------------------------------------
327
class TestMistAnchorIndexerIdempotency:
    """Re-indexing an already-indexed commit must not change row counts."""

    @pytest.mark.asyncio
    async def test_indexing_same_commit_twice_is_idempotent(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """Index the same commit twice and compare the counts of both passes.

        Counts are sampled after *each* pass. Checking only the final state
        would let an indexer that grows both tables in lockstep on a re-run
        slip through, because history and intel counts would still match.
        """
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"ops.py": b"def create(): pass\ndef delete(): pass\n"},
        )

        history_stmt = (
            select(func.count())
            .select_from(MusehubSymbolHistoryEntry)
            .where(MusehubSymbolHistoryEntry.repo_id == repo.repo_id)
        )
        intel_stmt = (
            select(func.count())
            .select_from(MusehubSymbolIntel)
            .where(MusehubSymbolIntel.repo_id == repo.repo_id)
        )

        # (history_count, intel_count) observed after each indexing pass.
        counts_per_pass: list[tuple[int, int]] = []
        for _ in range(2):
            await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
            await db_session.commit()
            history_rows = (await db_session.execute(history_stmt)).scalar_one()
            intel_rows = (await db_session.execute(intel_stmt)).scalar_one()
            counts_per_pass.append((history_rows, intel_rows))

        first_pass, second_pass = counts_per_pass
        assert second_pass == first_pass, (
            "Re-indexing the same commit changed row counts "
            f"(history, intel): {first_pass} -> {second_pass}"
        )

        history_count, intel_count = second_pass
        assert history_count == intel_count, (
            "Each anchor should produce exactly one history entry and one intel row"
        )
        # Verify rows are present (not zero from double-delete or something)
        assert history_count >= 2, f"Expected ≥2 anchors for create+delete; got {history_count}"
362
363
364 # ---------------------------------------------------------------------------
365 # 5. Edge cases
366 # ---------------------------------------------------------------------------
367
class TestMistAnchorIndexerEdgeCases:
    """Degenerate inputs and the shape of the indexer's return value."""

    @pytest.mark.asyncio
    async def test_commit_without_snapshot_returns_empty(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """A commit whose ``snapshot_id`` is None yields an empty result list."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        owner_id = compute_identity_id(test_user.handle.encode())
        created_at = _now()
        # Uses the module-level ``compute_repo_id`` import, like the sibling
        # test below, instead of re-importing it inside the function.
        repo_id = compute_repo_id(owner_id, "no-snap", "mist", created_at.isoformat())
        repo = MusehubRepo(
            repo_id=repo_id, name="no-snap", owner=test_user.handle,
            slug="no-snap", visibility="public", owner_user_id=owner_id,
            domain_id="mist", description="", tags=[], created_at=created_at,
        )
        db_session.add(repo)
        # Flush the repo row before adding rows that carry its repo_id, as the
        # other seeding code in this file does.
        await db_session.flush()

        cid = _commit_id()
        commit = MusehubCommit(
            commit_id=cid, message="empty",
            author=test_user.handle, branch="main", parent_ids=[],
            snapshot_id=None, timestamp=_now(),
        )
        db_session.add(commit)
        db_session.add(MusehubCommitRef(repo_id=repo_id, commit_id=cid))
        await db_session.commit()

        result = await build_mist_anchor_index(db_session, repo_id, cid)
        assert result == []

    @pytest.mark.asyncio
    async def test_object_missing_from_store_is_skipped(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """Object_id in manifest but no MusehubObject row → skip gracefully."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        owner_id = compute_identity_id(test_user.handle.encode())
        created_at = _now()
        repo_id = compute_repo_id(owner_id, "ghost-obj", "mist", created_at.isoformat())
        repo = MusehubRepo(
            repo_id=repo_id, name="ghost-obj", owner=test_user.handle,
            slug="ghost-obj", visibility="public", owner_user_id=owner_id,
            domain_id="mist", description="", tags=[], created_at=created_at,
        )
        db_session.add(repo)
        await db_session.flush()

        # The manifest references an object id for which no MusehubObject
        # row is ever inserted.
        ghost_oid = blob_id(b"ghost content that has no DB row")
        manifest = {"ghost.py": ghost_oid}
        snap_id = _snap_id(manifest)
        db_session.add(MusehubSnapshot(
            snapshot_id=snap_id, entry_count=1,
            manifest_blob=_manifest_blob(manifest),
        ))
        db_session.add(MusehubSnapshotRef(repo_id=repo_id, snapshot_id=snap_id))
        await db_session.flush()

        cid = _commit_id()
        commit = MusehubCommit(
            commit_id=cid, message="ghost",
            author=test_user.handle, branch="main", parent_ids=[],
            snapshot_id=snap_id, timestamp=_now(),
        )
        db_session.add(commit)
        db_session.add(MusehubCommitRef(repo_id=repo_id, commit_id=cid))
        await db_session.commit()

        # Must not raise — silently skips the missing object.
        result = await build_mist_anchor_index(db_session, repo_id, cid)
        assert isinstance(result, list)

    @pytest.mark.asyncio
    async def test_returns_intel_result_tuple(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """The result is a single ``("mist.anchor_index", data)`` pair."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"result.py": b"def answer(): return 42\n"},
        )

        result = await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)

        assert len(result) == 1
        intel_type, data = result[0]
        assert intel_type == "mist.anchor_index"
        assert "anchor_count" in data
        assert data["anchor_count"] >= 1
460
461
462 # ---------------------------------------------------------------------------
463 # 6. MistProvider delegates to build_mist_anchor_index
464 # ---------------------------------------------------------------------------
465
class TestMistProviderDelegatesToIndexer:
    """The registered ``intel.mist`` provider writes normalized history rows."""

    @pytest.mark.asyncio
    async def test_mist_provider_writes_normalized_rows(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """MistProvider.compute triggers the normalized indexer for VCS-backed mists."""
        from musehub.services.musehub_intel_providers import _PROVIDER_REGISTRY

        # Resolve the provider before seeding, so a missing registry entry
        # fails before any rows are written.
        provider = _PROVIDER_REGISTRY["intel.mist"]

        source = b"class Service:\n def run(self): pass\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"svc.py": source},
        )

        await provider.compute(db_session, repo.repo_id, commit.commit_id, {})
        await db_session.commit()

        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
        )
        result = await db_session.execute(stmt)
        rows = result.scalars().all()

        assert len(rows) >= 1, "MistProvider must write normalized symbol history entries"
File History 1 commit
sha256:7d6dd8f4a89e2d1fef2d84f6e65feaff51385d382f466766b7f690a22ec18e32 fix: fall back to DB ancestry check when mpack-only fast-fo… Sonnet 4.6 patch 6 days ago