test_mist_phase3_snapshot_indexer.py
file-level
1
files
1
commits
0
hotspots
0
🧊 dead
0
💥 blast risk
| 1 | """Phase 3 TDD: Mist snapshot indexer — symbol anchor extraction on push. |
| 2 | |
| 3 | Tests are written RED first. Run before touching musehub_mist_indexer.py |
| 4 | and musehub_intel_providers.py to confirm they fail, then implement to green. |
| 5 | |
| 6 | The indexer reads a mist repo's HEAD commit snapshot manifest, loads each |
| 7 | artifact's bytes from the object store, extracts symbol anchors, and writes |
| 8 | normalized rows to: |
| 9 | musehub_symbol_history_entries — one row per (repo_id, address, commit_id) |
| 10 | musehub_symbol_intel — one row per (repo_id, address) |
| 11 | |
| 12 | This makes mist anchors discoverable via muse code grep / code impact across |
| 13 | the entire hub, using the same infrastructure as code-domain symbols. |
| 14 | |
| 15 | Idempotency: indexing the same commit twice must produce the same row count. |
| 16 | """ |
| 17 | from __future__ import annotations |
| 18 | |
| 19 | import secrets |
| 20 | from datetime import datetime, timezone |
| 21 | |
| 22 | import msgpack |
| 23 | import pytest |
| 24 | from muse.core.types import blob_id |
| 25 | from sqlalchemy import func, select |
| 26 | from sqlalchemy.ext.asyncio import AsyncSession |
| 27 | |
| 28 | from musehub.core.genesis import compute_identity_id, compute_repo_id |
| 29 | from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry, MusehubSymbolIntel |
| 30 | from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubObject, MusehubRepo, MusehubSnapshot, MusehubSnapshotRef |
| 31 | from musehub.types.json_types import StrDict |
| 32 | |
| 33 | |
| 34 | # --------------------------------------------------------------------------- |
| 35 | # Helpers |
| 36 | # --------------------------------------------------------------------------- |
| 37 | |
| 38 | def _now() -> datetime: |
| 39 | return datetime.now(tz=timezone.utc) |
| 40 | |
| 41 | |
def _oid(content: bytes) -> str:
    """Return the object ID that ``blob_id`` computes for *content*."""
    object_id = blob_id(content)
    return object_id
| 44 | |
| 45 | |
def _manifest_blob(manifest: StrDict) -> bytes:
    """Serialize *manifest* to msgpack bytes (packed with ``use_bin_type=True``)."""
    packed = msgpack.packb(manifest, use_bin_type=True)
    return packed
| 48 | |
| 49 | |
def _commit_id() -> str:
    """Return a fresh commit ID: ``blob_id`` applied to 16 random bytes."""
    entropy = secrets.token_bytes(16)
    return blob_id(entropy)
| 52 | |
| 53 | |
def _snap_id(manifest: StrDict) -> str:
    """Return a snapshot ID derived from the manifest's sorted items.

    The key/value pairs are sorted before being msgpack-packed, so the
    resulting ID does not depend on the dict's insertion order.
    """
    ordered_entries = sorted(manifest.items())
    packed = msgpack.packb(ordered_entries, use_bin_type=True)
    return blob_id(packed)
| 56 | |
| 57 | |
async def _seed_mist_vcs_repo(
    session: AsyncSession,
    *,
    owner: str = "testuser",
    artifacts: dict[str, bytes],  # filename → raw content bytes
) -> tuple[MusehubRepo, MusehubCommit]:
    """Seed a mist repo whose single commit references a snapshot of *artifacts*.

    Rows are written in this order: the repo, one ``MusehubObject`` per
    artifact whose object ID is not already present (``content_cache`` holds
    the raw bytes so, per the original note, ``read_object_bytes()`` can serve
    them without hitting disk or S3), the snapshot and its repo ref, and
    finally the commit and its repo ref.  The session is committed before
    returning.

    Args:
        session: Async session used for every insert, flush, and the commit.
        owner: Handle stored as repo owner and commit author; its encoded
            bytes are also passed to ``compute_identity_id``.
        artifacts: Mapping of filename to raw content bytes.

    Returns:
        The refreshed ``(repo, commit)`` pair.
    """
    identity_id = compute_identity_id(owner.encode())
    repo_slug = f"mist-{secrets.token_hex(4)}"
    repo_created = _now()
    repo_id = compute_repo_id(identity_id, repo_slug, "mist", repo_created.isoformat())

    repo = MusehubRepo(
        repo_id=repo_id,
        name=repo_slug,
        owner=owner,
        slug=repo_slug,
        visibility="public",
        owner_user_id=identity_id,
        domain_id="mist",
        description="",
        tags=[],
        created_at=repo_created,
    )
    session.add(repo)
    await session.flush()

    # One object row per artifact, keyed by object ID; the manifest maps
    # each filename to that ID.
    manifest: dict[str, str] = {}
    for filename, raw in artifacts.items():
        object_id = _oid(raw)
        manifest[filename] = object_id
        # Identical bytes may appear under several filenames — only add the
        # object row when the lookup finds none.
        if await session.get(MusehubObject, object_id) is not None:
            continue
        session.add(
            MusehubObject(
                object_id=object_id,
                path=filename,
                size_bytes=len(raw),
                content_cache=raw,
            )
        )
    await session.flush()

    # Snapshot row (skipped when one with this ID already exists) plus the
    # repo → snapshot reference.
    snapshot_id = _snap_id(manifest)
    if await session.get(MusehubSnapshot, snapshot_id) is None:
        session.add(
            MusehubSnapshot(
                snapshot_id=snapshot_id,
                entry_count=len(manifest),
                manifest_blob=_manifest_blob(manifest),
            )
        )
    session.add(MusehubSnapshotRef(repo_id=repo_id, snapshot_id=snapshot_id))
    await session.flush()

    # Commit row pointing at the snapshot, plus the repo → commit reference.
    commit_id = _commit_id()
    commit = MusehubCommit(
        commit_id=commit_id,
        message="initial mist",
        author=owner,
        branch="main",
        parent_ids=[],
        snapshot_id=snapshot_id,
        timestamp=_now(),
    )
    session.add(commit)
    session.add(MusehubCommitRef(repo_id=repo_id, commit_id=commit_id))

    await session.commit()
    await session.refresh(repo)
    await session.refresh(commit)
    return repo, commit
| 135 | |
| 136 | |
| 137 | # --------------------------------------------------------------------------- |
| 138 | # 1. build_mist_anchor_index exists and is importable |
| 139 | # --------------------------------------------------------------------------- |
| 140 | |
class TestBuildMistAnchorIndexExists:
    """Smoke checks on the public shape of ``build_mist_anchor_index``."""

    def test_function_is_importable(self) -> None:
        """The indexer entry point imports and is a coroutine function."""
        import inspect

        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        is_coroutine = inspect.iscoroutinefunction(build_mist_anchor_index)
        assert is_coroutine

    def test_function_signature(self) -> None:
        """The entry point declares ``repo_id`` and ``head_commit_id`` parameters."""
        import inspect

        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        parameters = inspect.signature(build_mist_anchor_index).parameters
        for expected in ("repo_id", "head_commit_id"):
            assert expected in parameters
| 153 | |
| 154 | |
| 155 | # --------------------------------------------------------------------------- |
| 156 | # 2. Anchor extraction → musehub_symbol_history_entries |
| 157 | # --------------------------------------------------------------------------- |
| 158 | |
class TestMistAnchorIndexerHistoryEntries:
    """Indexing a mist commit populates ``MusehubSymbolHistoryEntry`` rows."""

    # NOTE(review): `db` in the `test_user` annotations is not imported in this
    # module's visible import block; it only works because annotations are
    # lazy (`from __future__ import annotations`) — confirm the intended import.

    @pytest.mark.asyncio
    async def test_python_artifact_writes_history_entries(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """Both functions of a Python artifact appear among the history addresses."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        source = b"def add(a, b):\n return a + b\n\ndef sub(a, b):\n return a - b\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"utils.py": source},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        query = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
        )
        entries = (await db_session.execute(query)).scalars().all()
        addresses = {entry.address for entry in entries}

        assert any("add" in a for a in addresses), f"Expected 'add' anchor; got {addresses}"
        assert any("sub" in a for a in addresses), f"Expected 'sub' anchor; got {addresses}"

    @pytest.mark.asyncio
    async def test_history_entry_fields(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """A history row carries the commit ID, author, op, and commit time."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        content = b"def process(x):\n return x\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"module.py": content},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        query = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
            MusehubSymbolHistoryEntry.address.like("module.py::%"),
        )
        result = await db_session.execute(query)
        row = result.scalars().first()

        assert row is not None
        assert row.commit_id == commit.commit_id
        assert row.author == test_user.handle
        assert row.op in ("add", "modify")
        assert row.committed_at is not None

    @pytest.mark.asyncio
    async def test_multiple_artifacts_all_indexed(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """Anchors from every artifact in the snapshot are indexed."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        files = {
            "a.py": b"def alpha(): pass\n",
            "b.py": b"def beta(): pass\n",
        }
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts=files,
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        query = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
        )
        entries = (await db_session.execute(query)).scalars().all()
        addresses = {entry.address for entry in entries}

        assert any("alpha" in a for a in addresses)
        assert any("beta" in a for a in addresses)

    @pytest.mark.asyncio
    async def test_binary_artifact_produces_no_history_entries(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """A PNG-like binary artifact yields zero history rows."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"image.png": png_bytes},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        count_query = (
            select(func.count())
            .select_from(MusehubSymbolHistoryEntry)
            .where(MusehubSymbolHistoryEntry.repo_id == repo.repo_id)
        )
        count = (await db_session.execute(count_query)).scalar_one()

        assert count == 0
| 266 | |
| 267 | |
| 268 | # --------------------------------------------------------------------------- |
| 269 | # 3. Anchor extraction → musehub_symbol_intel |
| 270 | # --------------------------------------------------------------------------- |
| 271 | |
class TestMistAnchorIndexerSymbolIntel:
    """Indexing a mist commit populates ``MusehubSymbolIntel`` rows."""

    # NOTE(review): `db` in the `test_user` annotations is not imported in this
    # module's visible import block; it only works because annotations are
    # lazy (`from __future__ import annotations`) — confirm the intended import.

    @pytest.mark.asyncio
    async def test_python_artifact_writes_symbol_intel(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """A Python artifact produces at least one intel row naming its function."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        source = b"def mul(a, b):\n return a * b\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"calc.py": source},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        query = select(MusehubSymbolIntel).where(
            MusehubSymbolIntel.repo_id == repo.repo_id,
        )
        rows = (await db_session.execute(query)).scalars().all()

        assert len(rows) >= 1
        addresses = {intel.address for intel in rows}
        assert any("mul" in a for a in addresses)

    @pytest.mark.asyncio
    async def test_symbol_intel_churn_is_at_least_one(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """An indexed anchor's intel row reports ``churn >= 1``."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        source = b"async def fetch(url):\n pass\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"api.py": source},
        )

        await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
        await db_session.commit()

        query = select(MusehubSymbolIntel).where(
            MusehubSymbolIntel.repo_id == repo.repo_id,
            MusehubSymbolIntel.address.like("api.py::%"),
        )
        result = await db_session.execute(query)
        row = result.scalars().first()

        assert row is not None
        assert row.churn >= 1
| 322 | |
| 323 | |
| 324 | # --------------------------------------------------------------------------- |
| 325 | # 4. Idempotency |
| 326 | # --------------------------------------------------------------------------- |
| 327 | |
class TestMistAnchorIndexerIdempotency:
    """Re-indexing an already indexed commit must leave row counts unchanged."""

    # NOTE(review): `db` in the `test_user` annotation is not imported in this
    # module's visible import block; it only works because annotations are
    # lazy (`from __future__ import annotations`) — confirm the intended import.

    @pytest.mark.asyncio
    async def test_indexing_same_commit_twice_is_idempotent(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """Index the same commit twice and compare row counts after each pass.

        Counts are captured after *each* pass so that an indexer which
        duplicates rows in both tables on the second pass is detected; merely
        comparing the two tables with each other after both passes would not
        catch that.
        """
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"ops.py": b"def create(): pass\ndef delete(): pass\n"},
        )

        async def _row_count(
            model: type[MusehubSymbolHistoryEntry] | type[MusehubSymbolIntel],
        ) -> int:
            """Count *model* rows belonging to the seeded repo."""
            stmt = (
                select(func.count())
                .select_from(model)
                .where(model.repo_id == repo.repo_id)
            )
            return (await db_session.execute(stmt)).scalar_one()

        counts_per_pass: list[tuple[int, int]] = []
        for _ in range(2):
            await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)
            await db_session.commit()
            counts_per_pass.append(
                (
                    await _row_count(MusehubSymbolHistoryEntry),
                    await _row_count(MusehubSymbolIntel),
                )
            )

        first_pass, second_pass = counts_per_pass
        assert second_pass == first_pass, (
            "Re-indexing the same commit changed (history, intel) row counts: "
            f"{first_pass} -> {second_pass}"
        )

        history_count, intel_count = second_pass
        assert history_count == intel_count, (
            "Each anchor should produce exactly one history entry and one intel row"
        )
        # Verify rows are present (not zero from double-delete or something)
        assert history_count >= 2, f"Expected ≥2 anchors for create+delete; got {history_count}"
| 362 | |
| 363 | |
| 364 | # --------------------------------------------------------------------------- |
| 365 | # 5. Edge cases |
| 366 | # --------------------------------------------------------------------------- |
| 367 | |
class TestMistAnchorIndexerEdgeCases:
    """Degenerate inputs: no snapshot, a missing object row, and the return shape."""

    # NOTE(review): `db` in the `test_user` annotations is not imported in this
    # module's visible import block; it only works because annotations are
    # lazy (`from __future__ import annotations`) — confirm the intended import.

    @pytest.mark.asyncio
    async def test_commit_without_snapshot_returns_empty(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """A commit whose ``snapshot_id`` is ``None`` yields an empty list."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        # compute_repo_id comes from the module-level import; no local
        # re-import is needed.
        owner_id = compute_identity_id(test_user.handle.encode())
        created_at = _now()
        repo_id = compute_repo_id(owner_id, "no-snap", "mist", created_at.isoformat())
        repo = MusehubRepo(
            repo_id=repo_id, name="no-snap", owner=test_user.handle,
            slug="no-snap", visibility="public", owner_user_id=owner_id,
            domain_id="mist", description="", tags=[], created_at=created_at,
        )
        db_session.add(repo)

        cid = _commit_id()
        commit = MusehubCommit(
            commit_id=cid, message="empty",
            author=test_user.handle, branch="main", parent_ids=[],
            snapshot_id=None, timestamp=_now(),
        )
        db_session.add(commit)
        db_session.add(MusehubCommitRef(repo_id=repo_id, commit_id=cid))
        await db_session.commit()

        result = await build_mist_anchor_index(db_session, repo_id, cid)
        assert result == []

    @pytest.mark.asyncio
    async def test_object_missing_from_store_is_skipped(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """Object_id in manifest but no MusehubObject row → skip gracefully."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        owner_id = compute_identity_id(test_user.handle.encode())
        created_at = _now()
        repo_id = compute_repo_id(owner_id, "ghost-obj", "mist", created_at.isoformat())
        repo = MusehubRepo(
            repo_id=repo_id, name="ghost-obj", owner=test_user.handle,
            slug="ghost-obj", visibility="public", owner_user_id=owner_id,
            domain_id="mist", description="", tags=[], created_at=created_at,
        )
        db_session.add(repo)
        await db_session.flush()

        # The manifest references an object ID for which no MusehubObject row
        # is ever inserted.
        ghost_oid = blob_id(b"ghost content that has no DB row")
        manifest = {"ghost.py": ghost_oid}
        snap_id = _snap_id(manifest)
        db_session.add(MusehubSnapshot(
            snapshot_id=snap_id, entry_count=1,
            manifest_blob=_manifest_blob(manifest),
        ))
        db_session.add(MusehubSnapshotRef(repo_id=repo_id, snapshot_id=snap_id))
        await db_session.flush()

        cid = _commit_id()
        commit = MusehubCommit(
            commit_id=cid, message="ghost",
            author=test_user.handle, branch="main", parent_ids=[],
            snapshot_id=snap_id, timestamp=_now(),
        )
        db_session.add(commit)
        db_session.add(MusehubCommitRef(repo_id=repo_id, commit_id=cid))
        await db_session.commit()

        # Must not raise — silently skips the missing object.
        result = await build_mist_anchor_index(db_session, repo_id, cid)
        assert isinstance(result, list)

    @pytest.mark.asyncio
    async def test_returns_intel_result_tuple(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """The result is one ``("mist.anchor_index", data)`` pair with an anchor count."""
        from musehub.services.musehub_mist_indexer import build_mist_anchor_index

        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"result.py": b"def answer(): return 42\n"},
        )

        result = await build_mist_anchor_index(db_session, repo.repo_id, commit.commit_id)

        assert len(result) == 1
        intel_type, data = result[0]
        assert intel_type == "mist.anchor_index"
        assert "anchor_count" in data
        assert data["anchor_count"] >= 1
| 460 | |
| 461 | |
| 462 | # --------------------------------------------------------------------------- |
| 463 | # 6. MistProvider delegates to build_mist_anchor_index |
| 464 | # --------------------------------------------------------------------------- |
| 465 | |
class TestMistProviderDelegatesToIndexer:
    """The registered ``intel.mist`` provider writes normalized history rows."""

    # NOTE(review): `db` in the `test_user` annotation is not imported in this
    # module's visible import block; it only works because annotations are
    # lazy (`from __future__ import annotations`) — confirm the intended import.

    @pytest.mark.asyncio
    async def test_mist_provider_writes_normalized_rows(
        self, db_session: AsyncSession, test_user: db.MusehubIdentity
    ) -> None:
        """MistProvider.compute triggers the normalized indexer for VCS-backed mists."""
        from musehub.services.musehub_intel_providers import _PROVIDER_REGISTRY

        provider = _PROVIDER_REGISTRY["intel.mist"]
        source = b"class Service:\n def run(self): pass\n"
        repo, commit = await _seed_mist_vcs_repo(
            db_session,
            owner=test_user.handle,
            artifacts={"svc.py": source},
        )

        await provider.compute(db_session, repo.repo_id, commit.commit_id, {})
        await db_session.commit()

        query = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
        )
        result = await db_session.execute(query)
        rows = result.scalars().all()

        assert len(rows) >= 1, "MistProvider must write normalized symbol history entries"