"""Tests for materialized per-file last-commit data.

FLC1  — get_file_last_commits returns empty dict when table has no rows for repo
FLC2  — compute_and_store_file_last_commits populates table from commit history
FLC3  — get_file_last_commits reads from table (single query, no blob decode)
FLC4  — file changed in newer commit → attributed to newer commit
FLC5  — file unchanged since first commit → attributed to oldest commit
FLC6  — directory path returns the commit of its most-recently-changed file
FLC7  — compute is idempotent: running twice gives same result, no duplicates
FLC8  — only paths requested are returned (no extra rows leaked)
FLC9  — unknown paths return no entry (no crash)
FLC10 — second push updates attribution for files that changed
FLC11 — more than 100 commits does not raise (batch chunking)
"""

from __future__ import annotations

import hashlib
from collections.abc import Mapping
from datetime import datetime, timezone, timedelta

import pytest
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from musehub.db.musehub_intel_models import MusehubFileLastCommit
from musehub.db.musehub_repo_models import (
    MusehubCommit,
    MusehubCommitRef,
    MusehubSnapshot,
    MusehubSnapshotRef,
)
from tests.factories import create_repo, create_branch


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _utc(offset_days: int = 0) -> datetime:
    """Return the current timezone-aware UTC time shifted by *offset_days* days."""
    return datetime.now(tz=timezone.utc) + timedelta(days=offset_days)


def _snap_id(name: str) -> str:
    """Return a deterministic ``sha256:``-prefixed snapshot id derived from *name*."""
    return "sha256:" + hashlib.sha256(name.encode()).hexdigest()


def _commit_id(name: str) -> str:
    """Return a deterministic ``sha256:``-prefixed commit id derived from *name*."""
    return "sha256:" + hashlib.sha256(f"commit:{name}".encode()).hexdigest()


def _obj_id(name: str) -> str:
    """Return a deterministic ``sha256:``-prefixed object id derived from *name*."""
    return "sha256:" + hashlib.sha256(f"obj:{name}".encode()).hexdigest()


async def _add_snapshot(
    session: AsyncSession,
    repo_id: str,
    snap_name: str,
    manifest: Mapping[str, str],
) -> str:
    """Store a snapshot with manifest blob and link it to *repo_id*.

    A ``MusehubSnapshot`` row is added only when none exists yet for the id
    derived from *snap_name*; a ``MusehubSnapshotRef`` for *repo_id* is then
    added and the session is flushed.

    Args:
        session: Session the rows are added to.
        repo_id: Repository the snapshot ref is attached to.
        snap_name: Name hashed (via ``_snap_id``) into the snapshot id.
        manifest: Mapping of path → object id, msgpack-encoded into the blob.

    Returns:
        The snapshot id.
    """
    import msgpack

    snap_id = _snap_id(snap_name)
    existing = await session.get(MusehubSnapshot, snap_id)
    if existing is None:
        session.add(MusehubSnapshot(
            snapshot_id=snap_id,
            directories=[],
            # msgpack serializes real dicts only; copying keeps the ``Mapping``
            # annotation honest for any mapping type a caller passes in.
            manifest_blob=msgpack.packb(dict(manifest), use_bin_type=True),
            entry_count=len(manifest),
            created_at=_utc(),
        ))
    session.add(MusehubSnapshotRef(repo_id=repo_id, snapshot_id=snap_id))
    await session.flush()
    return snap_id


async def _add_commit(
    session: AsyncSession,
    repo_id: str,
    name: str,
    snap_name: str,
    manifest: Mapping[str, str],
    branch: str = "main",
    ts_offset: int = 0,
    agent_id: str = "",
    message: str = "",
    parent: MusehubCommit | None = None,
) -> MusehubCommit:
    """Create a snapshot plus a commit pointing at it, and link both to *repo_id*.

    Args:
        session: Session the rows are added to.
        repo_id: Repository the commit ref is attached to.
        name: Name hashed (via ``_commit_id``) into the commit id; also used
            for the default message.
        snap_name: Name of the snapshot to store for this commit.
        manifest: Mapping of path → object id for the snapshot.
        branch: Branch name recorded on the commit.
        ts_offset: Day offset applied to "now" for the commit timestamp.
        agent_id: Agent id recorded on the commit.
        message: Commit message; defaults to ``"commit {name}"`` when empty.
        parent: Optional parent commit; its id becomes the sole parent id.

    Returns:
        The newly added ``MusehubCommit`` (already flushed).
    """
    snap_id = await _add_snapshot(session, repo_id, snap_name, manifest)
    commit = MusehubCommit(
        commit_id=_commit_id(name),
        branch=branch,
        parent_ids=[parent.commit_id] if parent else [],
        message=message or f"commit {name}",
        author="gabriel",
        timestamp=_utc(ts_offset),
        snapshot_id=snap_id,
        agent_id=agent_id,
    )
    session.add(commit)
    session.add(MusehubCommitRef(repo_id=repo_id, commit_id=commit.commit_id))
    await session.flush()
    return commit


# ---------------------------------------------------------------------------
# FLC1 — empty table → empty result
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc1_empty_table_returns_empty(db_session: AsyncSession) -> None:
    """FLC1: no rows in table → empty dict, no crash."""
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)

    result = await get_file_last_commits(
        db_session, repo.repo_id, ["README.md"], ref="main"
    )

    assert result == {}


# ---------------------------------------------------------------------------
# FLC2 — compute populates table
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc2_compute_populates_table(db_session: AsyncSession) -> None:
    """FLC2: compute_and_store_file_last_commits writes rows to musehub_file_last_commits."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")
    manifest = {"README.md": _obj_id("readme"), "src/app.py": _obj_id("app")}
    commit = await _add_commit(db_session, repo.repo_id, "c1", "s1", manifest)
    branch.head_commit_id = commit.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", commit.commit_id
    )
    await db_session.flush()

    # Inspect the table directly rather than going through the read API.
    rows = (await db_session.execute(
        select(MusehubFileLastCommit).where(
            MusehubFileLastCommit.repo_id == repo.repo_id,
            MusehubFileLastCommit.branch == "main",
        )
    )).scalars().all()
    paths = {r.path for r in rows}
    assert "README.md" in paths
    assert "src/app.py" in paths


# ---------------------------------------------------------------------------
# FLC3 — get_file_last_commits reads from table
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc3_reads_from_table(db_session: AsyncSession) -> None:
    """FLC3: after compute, get_file_last_commits returns data without blob decode."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")
    manifest = {"README.md": _obj_id("readme-v1")}
    commit = await _add_commit(
        db_session, repo.repo_id, "c1", "s1", manifest, message="feat: init"
    )
    branch.head_commit_id = commit.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", commit.commit_id
    )
    await db_session.flush()

    result = await get_file_last_commits(
        db_session, repo.repo_id, ["README.md"], ref="main"
    )

    assert "README.md" in result
    assert result["README.md"]["sha"] == commit.commit_id
    assert result["README.md"]["message"] == "feat: init"


# ---------------------------------------------------------------------------
# FLC4 — changed file attributed to newer commit
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc4_changed_file_attributed_to_newer_commit(db_session: AsyncSession) -> None:
    """FLC4: file that changed in commit 2 is attributed to commit 2, not commit 1."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")

    # README.md gets a new object id in v2; src/app.py keeps the same one.
    manifest_v1 = {"README.md": _obj_id("readme-v1"), "src/app.py": _obj_id("app-v1")}
    manifest_v2 = {"README.md": _obj_id("readme-v2"), "src/app.py": _obj_id("app-v1")}

    c1 = await _add_commit(
        db_session, repo.repo_id, "c1", "s1", manifest_v1, ts_offset=-1
    )
    c2 = await _add_commit(
        db_session, repo.repo_id, "c2", "s2", manifest_v2, ts_offset=0, parent=c1
    )
    branch.head_commit_id = c2.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", c2.commit_id
    )
    await db_session.flush()

    result = await get_file_last_commits(
        db_session, repo.repo_id, ["README.md", "src/app.py"], ref="main"
    )

    assert result["README.md"]["sha"] == c2.commit_id
    assert result["src/app.py"]["sha"] == c1.commit_id


# ---------------------------------------------------------------------------
# FLC5 — unchanged file attributed to first commit
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc5_unchanged_file_attributed_to_oldest_commit(db_session: AsyncSession) -> None:
    """FLC5: file never changed is attributed to the oldest commit in the walk."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")

    # The same object id is reused in all three commits.
    oid = _obj_id("stable")
    c1 = await _add_commit(
        db_session, repo.repo_id, "c1", "s1", {"stable.py": oid}, ts_offset=-2
    )
    c2 = await _add_commit(
        db_session, repo.repo_id, "c2", "s2", {"stable.py": oid}, ts_offset=-1, parent=c1
    )
    c3 = await _add_commit(
        db_session, repo.repo_id, "c3", "s3", {"stable.py": oid}, ts_offset=0, parent=c2
    )
    branch.head_commit_id = c3.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", c3.commit_id
    )
    await db_session.flush()

    result = await get_file_last_commits(
        db_session, repo.repo_id, ["stable.py"], ref="main"
    )

    assert result["stable.py"]["sha"] == c1.commit_id


# ---------------------------------------------------------------------------
# FLC6 — directory path → most-recently-changed file in dir
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc6_directory_attributed_to_most_recent_child_commit(db_session: AsyncSession) -> None:
    """FLC6: directory path resolves to the commit that last touched any file inside it."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")

    # Only src/b.py changes between the two manifests.
    m1 = {"src/a.py": _obj_id("a-v1"), "src/b.py": _obj_id("b-v1")}
    m2 = {"src/a.py": _obj_id("a-v1"), "src/b.py": _obj_id("b-v2")}

    c1 = await _add_commit(db_session, repo.repo_id, "c1", "s1", m1, ts_offset=-1)
    # NOTE(review): c2 is created without parent=c1, unlike FLC4/FLC5 — confirm
    # this is intended.
    c2 = await _add_commit(db_session, repo.repo_id, "c2", "s2", m2, ts_offset=0)
    branch.head_commit_id = c2.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", c2.commit_id
    )
    await db_session.flush()

    result = await get_file_last_commits(
        db_session, repo.repo_id, ["src"], ref="main"
    )

    assert result["src"]["sha"] == c2.commit_id


# ---------------------------------------------------------------------------
# FLC7 — idempotent
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc7_compute_is_idempotent(db_session: AsyncSession) -> None:
    """FLC7: running compute twice yields same result, no duplicate rows."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")
    commit = await _add_commit(
        db_session, repo.repo_id, "c1", "s1", {"a.py": _obj_id("a")}
    )
    branch.head_commit_id = commit.commit_id
    await db_session.flush()

    # Two back-to-back runs for the same head, flushed once afterwards.
    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", commit.commit_id
    )
    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", commit.commit_id
    )
    await db_session.flush()

    rows = (await db_session.execute(
        select(MusehubFileLastCommit).where(
            MusehubFileLastCommit.repo_id == repo.repo_id,
            MusehubFileLastCommit.branch == "main",
            MusehubFileLastCommit.path == "a.py",
        )
    )).scalars().all()
    assert len(rows) == 1


# ---------------------------------------------------------------------------
# FLC8 — only requested paths returned
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc8_only_requested_paths_returned(db_session: AsyncSession) -> None:
    """FLC8: get_file_last_commits returns only the paths asked for."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")
    manifest = {"a.py": _obj_id("a"), "b.py": _obj_id("b"), "c.py": _obj_id("c")}
    commit = await _add_commit(db_session, repo.repo_id, "c1", "s1", manifest)
    branch.head_commit_id = commit.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", commit.commit_id
    )
    await db_session.flush()

    result = await get_file_last_commits(
        db_session, repo.repo_id, ["a.py"], ref="main"
    )

    assert set(result) == {"a.py"}


# ---------------------------------------------------------------------------
# FLC9 — unknown paths return no entry
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc9_unknown_paths_not_in_result(db_session: AsyncSession) -> None:
    """FLC9: paths not in any snapshot are silently absent from result."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")
    commit = await _add_commit(
        db_session, repo.repo_id, "c1", "s1", {"real.py": _obj_id("r")}
    )
    branch.head_commit_id = commit.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", commit.commit_id
    )
    await db_session.flush()

    result = await get_file_last_commits(
        db_session, repo.repo_id, ["ghost.py"], ref="main"
    )

    assert "ghost.py" not in result


# ---------------------------------------------------------------------------
# FLC10 — second push updates changed files
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc10_second_push_updates_changed_files(db_session: AsyncSession) -> None:
    """FLC10: after a second push, files that changed point to the new commit."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")

    m1 = {"README.md": _obj_id("readme-v1")}
    m2 = {"README.md": _obj_id("readme-v2")}

    # First push: compute with c1 as head.
    c1 = await _add_commit(db_session, repo.repo_id, "c1", "s1", m1, ts_offset=-1)
    branch.head_commit_id = c1.commit_id
    await db_session.flush()
    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", c1.commit_id
    )
    await db_session.flush()

    # Second push: README.md changes and compute runs again with c2 as head.
    # NOTE(review): c2 is created without parent=c1 — confirm this is intended.
    c2 = await _add_commit(db_session, repo.repo_id, "c2", "s2", m2, ts_offset=0)
    branch.head_commit_id = c2.commit_id
    await db_session.flush()
    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", c2.commit_id
    )
    await db_session.flush()

    result = await get_file_last_commits(
        db_session, repo.repo_id, ["README.md"], ref="main"
    )

    assert result["README.md"]["sha"] == c2.commit_id


# ---------------------------------------------------------------------------
# FLC11 — more than 100 commits does not raise
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_flc11_over_100_commits_does_not_raise(db_session: AsyncSession) -> None:
    """FLC11: repos with >100 commits must not raise ValueError from batch limit."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    branch = await create_branch(db_session, repo.repo_id, name="main")

    n = 150
    last_commit: MusehubCommit | None = None
    # Each commit gets a distinct object id and an increasing timestamp
    # (offsets -n .. -1 days).
    # NOTE(review): commits are created without parent links — confirm the
    # history walk still sees all of them.
    for i in range(n):
        manifest = {"src/app.py": _obj_id(f"app-v{i}")}
        last_commit = await _add_commit(
            db_session, repo.repo_id, f"c{i}", f"s{i}", manifest, ts_offset=i - n
        )
    # n > 0 so the loop ran at least once; the assert narrows the Optional
    # instead of silencing the type checker.
    assert last_commit is not None

    branch.head_commit_id = last_commit.commit_id
    await db_session.flush()

    # Must not raise ValueError: batch size N exceeds limit 100
    await compute_and_store_file_last_commits(
        db_session, repo.repo_id, "main", last_commit.commit_id
    )
    await db_session.flush()

    result = await get_file_last_commits(
        db_session, repo.repo_id, ["src/app.py"], ref="main"
    )

    assert "src/app.py" in result