"""Tests for the missing-snapshot integrity invariant in pack building. Root cause ---------- ``build_pack_from_walk`` silently skips a snapshot when its file is absent, but still includes the commit that references it in the pack mpack. The remote then receives a commit record pointing to a snapshot_id it will never have — a dangling reference that silently corrupts the remote's history. Invariant being enforced ------------------------ Every commit in a push mpack MUST have its snapshot present in the local store. If any snapshot file is missing, ``build_pack_from_walk`` raises ``ValueError`` ("Push aborted") rather than sending a commit with a dangling snapshot reference. Behaviour: * ``walk_commits`` detects missing snapshots and reports them in ``missing_snapshots``; a WARNING is emitted for each. * ``build_pack_from_walk`` raises ``ValueError`` if ``missing_snapshots`` is non-empty — no partial mpack is ever returned. These tests drive the implementation in ``muse/core/pack.py``. """ from __future__ import annotations import datetime import hashlib import pathlib import pytest from muse.core.types import Manifest, blob_id from muse.core.object_store import write_object type _FileBytes = dict[str, bytes] from muse.core.mpack import MPack as PackBundle, build_mpack_from_walk as build_pack_from_walk, walk_commits from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id from muse.core.commits import ( CommitRecord, write_commit, ) from muse.core.snapshots import ( SnapshotRecord, write_snapshot, ) from muse.core.paths import ref_path, muse_dir # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- _REPO_ID = "integrity-test" def _init_repo(root: pathlib.Path) -> None: import json as _json dot_muse = muse_dir(root) for d in ("commits", "snapshots", "objects", "refs/heads"): (dot_muse / d).mkdir(parents=True, exist_ok=True) (dot_muse / "HEAD").write_text("ref: refs/heads/main", encoding="utf-8") (dot_muse / "repo.json").write_text( _json.dumps({"repo_id": _REPO_ID, "domain": "code"}), encoding="utf-8" ) def _make_commit( root: pathlib.Path, files: _FileBytes, message: str, parent_id: str | None = None, branch: str = "main", write_snap: bool = True, ) -> CommitRecord: """Create a commit, optionally skipping snapshot write to simulate corruption.""" manifest = {} for path, content in files.items(): oid = blob_id(content) write_object(root, oid, content) manifest[path] = oid snap_id = compute_snapshot_id(manifest) now = datetime.datetime.now(datetime.timezone.utc) commit_id = compute_commit_id( parent_ids=[parent_id] if parent_id else [], snapshot_id=snap_id, message=message, committed_at_iso=now.isoformat(), ) if write_snap: write_snapshot(root, SnapshotRecord(snapshot_id=snap_id, manifest=manifest)) record = CommitRecord( commit_id=commit_id, branch=branch, snapshot_id=snap_id, message=message, committed_at=now, parent_commit_id=parent_id, ) write_commit(root, record) (ref_path(root, branch)).write_text(commit_id, encoding="utf-8") return record # --------------------------------------------------------------------------- # I — walk_commits exposes missing_snapshots # --------------------------------------------------------------------------- class TestWalkCommitsMissingSnapshotDetection: """walk_commits must report commits whose snapshot files are absent.""" def test_walk_commits_no_missing_snapshots_when_all_present( self, tmp_path: pathlib.Path ) -> None: _init_repo(tmp_path) c = _make_commit(tmp_path, {"a.py": b"x"}, "first", write_snap=True) result = walk_commits(tmp_path, [c.commit_id]) assert not result["missing_snapshots"], ( "No snapshots are missing — missing_snapshots should be empty" ) def test_walk_commits_detects_single_missing_snapshot( self, tmp_path: pathlib.Path ) -> None: _init_repo(tmp_path) c1 = _make_commit(tmp_path, {"a.py": b"v1"}, "first", write_snap=True) # Second commit: snapshot file deliberately not written c2 = _make_commit(tmp_path, {"a.py": b"v2"}, "second", parent_id=c1.commit_id, write_snap=False) result = walk_commits(tmp_path, [c2.commit_id]) assert c2.snapshot_id in result["missing_snapshots"], ( "walk_commits must expose the missing snapshot_id" ) def test_walk_commits_detects_multiple_missing_snapshots_in_chain( self, tmp_path: pathlib.Path ) -> None: _init_repo(tmp_path) c1 = _make_commit(tmp_path, {"f.py": b"v1"}, "A", write_snap=True) c2 = _make_commit(tmp_path, {"f.py": b"v2"}, "B", parent_id=c1.commit_id, write_snap=False) c3 = _make_commit(tmp_path, {"f.py": b"v3"}, "C", parent_id=c2.commit_id, write_snap=False) c4 = _make_commit(tmp_path, {"f.py": b"v4"}, "D", parent_id=c3.commit_id, write_snap=True) result = walk_commits(tmp_path, [c4.commit_id]) assert c2.snapshot_id in result["missing_snapshots"] assert c3.snapshot_id in result["missing_snapshots"] assert c1.snapshot_id not in result["missing_snapshots"] assert c4.snapshot_id not in result["missing_snapshots"] def test_walk_commits_missing_snapshots_not_in_have_are_excluded( self, tmp_path: pathlib.Path ) -> None: """Commits in the have-set are never walked so their snapshots don't matter.""" _init_repo(tmp_path) c1 = _make_commit(tmp_path, {"f.py": b"v1"}, "A", write_snap=False) c2 = _make_commit(tmp_path, {"f.py": b"v2"}, "B", parent_id=c1.commit_id, write_snap=True) # c1 is in have — BFS stops before it; its missing snapshot is irrelevant. result = walk_commits(tmp_path, [c2.commit_id], have=[c1.commit_id]) assert not result["missing_snapshots"], ( "Commits in have are not walked — their snapshots should not be flagged" ) # --------------------------------------------------------------------------- # II — build_pack_from_walk raises when missing snapshots are present # --------------------------------------------------------------------------- class TestBuildPackExcludesCommitsWithMissingSnapshot: """build_pack_from_walk must raise ValueError when any snapshot is absent. Silently skipping would push commits without their snapshots, creating dangling references on the remote that can never be healed without rewriting history. The strict raise forces the caller to either repair the store (``muse verify``) or exclude the broken commits before pushing. """ def test_pack_raises_when_snapshot_missing( self, tmp_path: pathlib.Path ) -> None: _init_repo(tmp_path) c1 = _make_commit(tmp_path, {"a.py": b"v1"}, "good", write_snap=True) c2 = _make_commit(tmp_path, {"a.py": b"v2"}, "broken", parent_id=c1.commit_id, write_snap=False) walk = walk_commits(tmp_path, [c2.commit_id]) with pytest.raises(ValueError, match="Push aborted"): build_pack_from_walk(tmp_path, walk) def test_pack_includes_commit_when_snapshot_present( self, tmp_path: pathlib.Path ) -> None: _init_repo(tmp_path) c1 = _make_commit(tmp_path, {"a.py": b"v1"}, "good", write_snap=True) walk = walk_commits(tmp_path, [c1.commit_id]) mpack = build_pack_from_walk(tmp_path, walk) commit_ids_in_pack = {c["commit_id"] for c in mpack["commits"]} assert c1.commit_id in commit_ids_in_pack def test_pack_raises_when_any_snapshot_missing_in_chain( self, tmp_path: pathlib.Path ) -> None: """A single missing snapshot in a chain aborts the entire pack.""" _init_repo(tmp_path) c1 = _make_commit(tmp_path, {"f.py": b"v1"}, "A", write_snap=True) c2 = _make_commit(tmp_path, {"f.py": b"v2"}, "B", parent_id=c1.commit_id, write_snap=False) c3 = _make_commit(tmp_path, {"f.py": b"v3"}, "C", parent_id=c2.commit_id, write_snap=True) walk = walk_commits(tmp_path, [c3.commit_id]) with pytest.raises(ValueError, match="Push aborted"): build_pack_from_walk(tmp_path, walk) def test_pack_bundle_snapshot_list_and_commit_list_are_consistent( self, tmp_path: pathlib.Path ) -> None: """Every snapshot_id referenced by a commit in the mpack must be present in mpack['snapshots'] — verified on a fully intact chain.""" _init_repo(tmp_path) c1 = _make_commit(tmp_path, {"a.py": b"v1"}, "A", write_snap=True) c2 = _make_commit(tmp_path, {"a.py": b"v2"}, "B", parent_id=c1.commit_id, write_snap=True) c3 = _make_commit(tmp_path, {"a.py": b"v3"}, "C", parent_id=c2.commit_id, write_snap=True) walk = walk_commits(tmp_path, [c3.commit_id]) mpack = build_pack_from_walk(tmp_path, walk) snap_ids_in_bundle = {s["snapshot_id"] for s in mpack["snapshots"]} for commit_dict in mpack["commits"]: sid = commit_dict["snapshot_id"] assert sid in snap_ids_in_bundle, ( f"Commit {commit_dict['commit_id'][:8]} references snapshot " f"{sid[:8]} which is not in the mpack — dangling reference" ) def test_no_warning_when_all_snapshots_present( self, tmp_path: pathlib.Path, caplog: pytest.LogCaptureFixture ) -> None: _init_repo(tmp_path) c = _make_commit(tmp_path, {"x.py": b"ok"}, "clean", write_snap=True) import logging with caplog.at_level(logging.WARNING, logger="muse.core.mpack"): walk = walk_commits(tmp_path, [c.commit_id]) build_pack_from_walk(tmp_path, walk) assert "not found" not in caplog.text def test_warning_emitted_when_snapshot_missing( self, tmp_path: pathlib.Path, caplog: pytest.LogCaptureFixture ) -> None: _init_repo(tmp_path) c = _make_commit(tmp_path, {"x.py": b"broken"}, "oops", write_snap=False) import logging with caplog.at_level(logging.WARNING, logger="muse.core.mpack"): walk = walk_commits(tmp_path, [c.commit_id]) with pytest.raises(ValueError, match="Push aborted"): build_pack_from_walk(tmp_path, walk) assert c.snapshot_id[:8] in caplog.text # --------------------------------------------------------------------------- # III — regression: the real muse repo's 3 broken commits # --------------------------------------------------------------------------- class TestMissingSnapshotRegressionInvariant: """Verify the invariant holds end-to-end: every reachable commit in a repo that we attempt to push must have its snapshot present — build_pack_from_walk raises ValueError rather than sending a commit with a dangling snapshot ref.""" def test_pack_aborts_on_chain_with_gaps( self, tmp_path: pathlib.Path ) -> None: """A chain with missing snapshots raises ValueError, not a partial mpack.""" _init_repo(tmp_path) # Build: A(good) → B(broken) → C(broken) → D(good) c_a = _make_commit(tmp_path, {"f": b"a"}, "A", write_snap=True) c_b = _make_commit(tmp_path, {"f": b"b"}, "B", parent_id=c_a.commit_id, write_snap=False) c_c = _make_commit(tmp_path, {"f": b"c"}, "C", parent_id=c_b.commit_id, write_snap=False) c_d = _make_commit(tmp_path, {"f": b"d"}, "D", parent_id=c_c.commit_id, write_snap=True) walk = walk_commits(tmp_path, [c_d.commit_id]) with pytest.raises(ValueError, match="Push aborted"): build_pack_from_walk(tmp_path, walk) def test_reachable_commits_with_missing_snapshots_are_reported( self, tmp_path: pathlib.Path ) -> None: """walk_commits must expose all missing snapshot_ids so callers can surface the issue before attempting a push.""" _init_repo(tmp_path) c1 = _make_commit(tmp_path, {"f": b"1"}, "root", write_snap=True) c2 = _make_commit(tmp_path, {"f": b"2"}, "broken-1", parent_id=c1.commit_id, write_snap=False) c3 = _make_commit(tmp_path, {"f": b"3"}, "broken-2", parent_id=c2.commit_id, write_snap=False) c4 = _make_commit(tmp_path, {"f": b"4"}, "broken-3", parent_id=c3.commit_id, write_snap=False) c5 = _make_commit(tmp_path, {"f": b"5"}, "good", parent_id=c4.commit_id, write_snap=True) result = walk_commits(tmp_path, [c5.commit_id]) missing = result["missing_snapshots"] assert c2.snapshot_id in missing assert c3.snapshot_id in missing assert c4.snapshot_id in missing assert c1.snapshot_id not in missing assert c5.snapshot_id not in missing