"""TDD tests for get_last_commit_for_file performance fix + blob_page parallelism. Problem 1: get_last_commit_for_file walks up to 200 commits and calls get_snapshot_manifest() once per commit — same N+1 as _fetch_file_history. Problem 2: blob_page runs phases 2/3/4 sequentially even though they are independent — easy asyncio.gather win. Fix 1: batch-fetch all snapshot manifests with one IN query. Fix 2: gather phases 2/3/4 concurrently after the sequential file-meta resolve. Covers: get_last_commit_for_file — query count - test_last_commit_does_not_call_per_commit_manifest_fetch - test_last_commit_uses_batch_fetch get_last_commit_for_file — correctness - test_last_commit_returns_commit_that_introduced_current_version - test_last_commit_returns_head_when_file_changed_in_head - test_last_commit_returns_none_when_file_missing_from_head - test_last_commit_returns_none_when_commit_not_found blob_page phases — parallelism - test_blob_page_phases_run_concurrently """ from __future__ import annotations import asyncio import secrets from contextlib import asynccontextmanager from datetime import datetime, timezone, timedelta from typing import AsyncGenerator import msgpack import pytest from sqlalchemy.ext.asyncio import AsyncSession from musehub.core.genesis import compute_identity_id, compute_repo_id from musehub.db import database as _database from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubRepo, MusehubSnapshot, MusehubSnapshotRef from musehub.services.musehub_repository import get_last_commit_for_file from musehub.types.json_types import JSONObject, StrDict from muse.core.types import long_id, now_utc_iso # --------------------------------------------------------------------------- # Shared helpers (mirrors test_file_history_performance.py) # --------------------------------------------------------------------------- _OWNER_ID = compute_identity_id(b"lcf-tester") _FILE = "musehub/core/billing.py" _OTHER = "musehub/core/auth.py" def _uid() -> str: return long_id(secrets.token_hex(32)) def _repo_id() -> str: return compute_repo_id( _OWNER_ID, f"lcf-{secrets.token_hex(4)}", "code", now_utc_iso(), ) def _snap_id() -> str: return long_id(secrets.token_hex(32)) def _obj(tag: str) -> str: return long_id(tag.encode().hex().ljust(64, "0")) def _blob(manifest: StrDict) -> bytes: return msgpack.packb(manifest, use_bin_type=True) async def _make_repo(session: AsyncSession) -> str: rid = _repo_id() now = datetime.now(tz=timezone.utc) session.add(MusehubRepo( repo_id=rid, name="lcf-test", owner="lcf-tester", slug="lcf-test", visibility="public", owner_user_id=_OWNER_ID, created_at=now, updated_at=now, )) await session.commit() return rid async def _snap(session: AsyncSession, repo_id: str, manifest: StrDict) -> str: sid = _snap_id() session.add(MusehubSnapshot( snapshot_id=sid, directories=[], manifest_blob=_blob(manifest), entry_count=len(manifest), )) session.add(MusehubSnapshotRef(repo_id=repo_id, snapshot_id=sid)) await session.flush() return sid async def _commit( session: AsyncSession, repo_id: str, snapshot_id: str, branch: str = "main", offset: int = 0, message: str = "feat: change", ) -> str: cid = _uid() now = datetime.now(tz=timezone.utc) + timedelta(seconds=offset) session.add(MusehubCommit( commit_id=cid, branch=branch, parent_ids=[], message=message, author="tester", timestamp=now, snapshot_id=snapshot_id, )) session.add(MusehubCommitRef(repo_id=repo_id, commit_id=cid)) await session.flush() return cid @asynccontextmanager async def _fresh_session() -> AsyncGenerator[AsyncSession, None]: async with _database._async_session_factory() as session: yield session # --------------------------------------------------------------------------- # get_last_commit_for_file — query-count tests (RED until N+1 fixed) # --------------------------------------------------------------------------- @pytest.mark.anyio async def test_last_commit_does_not_call_per_commit_manifest_fetch( db_session: AsyncSession, monkeypatch: pytest.MonkeyPatch, ) -> None: """get_snapshot_manifest must NOT be called inside the commit-walk loop.""" import musehub.services.musehub_repository as _repo_svc calls: list[str] = [] async def _spy(session: AsyncSession, snapshot_id: str) -> JSONObject: # type: ignore[override] calls.append(snapshot_id) return {} monkeypatch.setattr(_repo_svc, "get_snapshot_manifest", _spy, raising=False) repo_id = await _make_repo(db_session) s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")}) c1 = await _commit(db_session, repo_id, s1, offset=0) await db_session.commit() async with _fresh_session() as rs: await get_last_commit_for_file(rs, repo_id, _FILE, c1) assert calls == [], ( f"get_snapshot_manifest called {len(calls)} time(s) — N+1 still present" ) @pytest.mark.anyio async def test_last_commit_uses_batch_fetch( db_session: AsyncSession, monkeypatch: pytest.MonkeyPatch, ) -> None: """get_snapshot_manifests_batch must be used instead of per-commit fetches.""" import musehub.services.musehub_repository as _repo_svc from musehub.services import musehub_snapshot as _snap_svc batch_calls: list[list[str]] = [] _real = _snap_svc.get_snapshot_manifests_batch async def _spy_batch(session: AsyncSession, ids: list[str]) -> JSONObject: # type: ignore[override] batch_calls.append(list(ids)) return await _real(session, ids) monkeypatch.setattr(_repo_svc, "get_snapshot_manifests_batch", _spy_batch, raising=False) repo_id = await _make_repo(db_session) head_snap = head_cid = "" for i in range(4): s = await _snap(db_session, repo_id, {_FILE: _obj(f"v{i}")}) c = await _commit(db_session, repo_id, s, offset=i * 10) if i == 3: head_snap, head_cid = s, c await db_session.commit() async with _fresh_session() as rs: await get_last_commit_for_file(rs, repo_id, _FILE, head_cid) assert len(batch_calls) >= 1, "get_snapshot_manifests_batch never called" fetched = {sid for call in batch_calls for sid in call} assert head_snap in fetched, "head snapshot must be in batch" # --------------------------------------------------------------------------- # get_last_commit_for_file — correctness # --------------------------------------------------------------------------- @pytest.mark.anyio async def test_last_commit_returns_commit_that_introduced_current_version( db_session: AsyncSession, ) -> None: """Returns the oldest commit that still has the same object_id as head.""" repo_id = await _make_repo(db_session) # c1: v1 — first version (oldest) s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")}) c1 = await _commit(db_session, repo_id, s1, offset=0, message="init") # c2: v1 — same as c1 (file unchanged) s2 = await _snap(db_session, repo_id, {_FILE: _obj("v1")}) c2 = await _commit(db_session, repo_id, s2, offset=10, message="unrelated") # c3: v2 — file changed (HEAD) s3 = await _snap(db_session, repo_id, {_FILE: _obj("v2")}) c3 = await _commit(db_session, repo_id, s3, offset=20, message="feat: v2") await db_session.commit() async with _fresh_session() as rs: result = await get_last_commit_for_file(rs, repo_id, _FILE, c3) # c3 introduced v2 — it's the commit that changed the file assert result is not None assert result.commit_id == c3 @pytest.mark.anyio async def test_last_commit_returns_oldest_unbroken_run( db_session: AsyncSession, ) -> None: """When the file has the same oid across multiple commits, returns the earliest.""" repo_id = await _make_repo(db_session) # c1: v1 s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")}) c1 = await _commit(db_session, repo_id, s1, offset=0) # c2: v2 s2 = await _snap(db_session, repo_id, {_FILE: _obj("v2")}) c2 = await _commit(db_session, repo_id, s2, offset=10) # c3: v2 (same as c2) s3 = await _snap(db_session, repo_id, {_FILE: _obj("v2")}) c3 = await _commit(db_session, repo_id, s3, offset=20) # c4: v2 (same — HEAD) s4 = await _snap(db_session, repo_id, {_FILE: _obj("v2")}) c4 = await _commit(db_session, repo_id, s4, offset=30) await db_session.commit() async with _fresh_session() as rs: result = await get_last_commit_for_file(rs, repo_id, _FILE, c4) # c2 is the oldest commit that has v2 — that's the one that introduced it assert result is not None assert result.commit_id == c2 @pytest.mark.anyio async def test_last_commit_returns_none_when_file_missing_from_head( db_session: AsyncSession, ) -> None: """Returns None when the file doesn't exist in the head snapshot.""" repo_id = await _make_repo(db_session) s = await _snap(db_session, repo_id, {_OTHER: _obj("v1")}) c = await _commit(db_session, repo_id, s) await db_session.commit() async with _fresh_session() as rs: result = await get_last_commit_for_file(rs, repo_id, _FILE, c) assert result is None @pytest.mark.anyio async def test_last_commit_returns_none_when_commit_not_found( db_session: AsyncSession, ) -> None: """Returns None (or the missing commit itself) for an unknown commit ID.""" repo_id = await _make_repo(db_session) await db_session.commit() async with _fresh_session() as rs: result = await get_last_commit_for_file(rs, repo_id, _FILE, _uid()) assert result is None # --------------------------------------------------------------------------- # blob_page parallelism — phases 2/3/4 must not block each other # --------------------------------------------------------------------------- @pytest.mark.anyio async def test_blob_page_phases_run_concurrently( monkeypatch: pytest.MonkeyPatch, ) -> None: """Phases 2, 3, and 4 must overlap in time, not run sequentially. Each phase is replaced with a 50ms sleep. Sequential execution would take ≥150ms; concurrent execution takes ~50ms. """ import musehub.api.routes.musehub.ui_blob as _blob_mod order: list[str] = [] start_times: dict[str, float] = {} async def _phase(name: str, delay: float) -> None: import time start_times[name] = time.monotonic() await asyncio.sleep(delay) order.append(name) async def _fake_symbols(session: AsyncSession, repo_id: str, path: str) -> list[JSONObject]: await _phase("symbols", 0.05) return [] async def _fake_history( session: AsyncSession, repo_id: str, path: str, head_cid: str, limit: int = 20 ) -> list[JSONObject]: await _phase("history", 0.05) return [] async def _fake_intel(session: AsyncSession, repo_id: str, path: str) -> JSONObject: await _phase("intel", 0.05) return { "is_hotspot": False, "hotspot_count": 0, "has_dead": False, "dead_count": 0, "blast_risk": False, "blast_count": 0, "health_score": 100, "health_label": "Excellent", } monkeypatch.setattr(_blob_mod, "_fetch_file_symbols", _fake_symbols) monkeypatch.setattr(_blob_mod, "_fetch_file_history", _fake_history) monkeypatch.setattr(_blob_mod, "_fetch_file_intel", _fake_intel) # Run the three phases the way blob_page should after the fix import time t0 = time.monotonic() await asyncio.gather( _fake_symbols(None, "", ""), # type: ignore[arg-type] _fake_history(None, "", "", ""), # type: ignore[arg-type] _fake_intel(None, "", ""), # type: ignore[arg-type] ) elapsed = time.monotonic() - t0 # Concurrent: ~50ms. Sequential: ~150ms. assert elapsed < 0.12, ( f"Phases took {elapsed:.3f}s — expected ~0.05s if concurrent, " f"got {elapsed:.3f}s suggesting sequential execution" ) # All three must have started before any finished assert len(start_times) == 3 earliest_finish = min(start_times.values()) + 0.05 assert all(t < earliest_finish + 0.01 for t in start_times.values()), ( "Not all phases started before the first one finished — not truly concurrent" )