"""Tests for Bug 10: GC deletes objects reachable from a corrupt commit. When a commit file has a corrupt snapshot_id field (bit-flip, tampered, or previously written by the now-fixed from_dict timestamp substitution bug), _collect_reachable_objects returns an empty set for that commit's snapshot. muse gc then deletes those objects, permanently destroying file content. The sequence of doom: 1. Commit A on disk: snapshot_id = "f"*64 (corrupt — should be "S1") 2. `get_all_commits` returns this commit (no hash verification) 3. `read_snapshot(root, "f"*64)` → None (file for "f"*64 doesn't exist) 4. Objects from the REAL snapshot S1 are NOT added to reachable 5. `muse gc` deletes those objects (no other commit references them) 6. The working tree cannot be reconstructed from commit A Fix: `_collect_reachable_objects` must also scan snapshot files directly (without hash verification) when read_snapshot returns None for a commit's snapshot_id. This conservatively retains any object referenced in any snapshot manifest on disk, regardless of whether the commit's snapshot_id field is correct. Scope of tests -------------- Unit (_collect_reachable_objects): - Objects reachable from a valid commit are retained - Objects reachable from a commit with corrupt snapshot_id (field points to non-existent snapshot) are still retained via raw snapshot scan - Objects reachable from a commit with corrupt snapshot_id (field points to a WRONG existing snapshot) are still retained via raw snapshot scan - GC does NOT delete objects that are in ANY snapshot on disk, even orphaned - Empty store returns empty reachable set (no crash) - Multiple commits, one corrupt: all objects from all snapshots retained Integration (run_gc with corrupt commit): - run_gc dry_run=True does not delete anything from a store with corrupt commit - run_gc does not delete objects from corrupt commit's snapshot (default mode) - run_gc --full does not delete objects from corrupt commit's snapshot - Objects from a legitimately unreachable commit ARE deleted (GC still works) Stress: - 50 commits, 5 with corrupt snapshot_ids: all objects from all 50 snapshots retained """ from __future__ import annotations type _FileStore = dict[str, bytes] import datetime import hashlib import pathlib import pytest from muse.core.gc import _collect_reachable_objects, run_gc from muse.core.paths import commits_dir, heads_dir, muse_dir, snapshots_dir from muse.core.types import NULL_COMMIT_ID from muse.core.object_store import write_object from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id from muse.core.commits import ( CommitRecord, write_commit, ) from muse.core.snapshots import ( SnapshotRecord, read_snapshot, write_snapshot, ) _TS = datetime.datetime(2024, 6, 15, 10, 0, 0, tzinfo=datetime.timezone.utc) def _make_repo(tmp_path: pathlib.Path) -> pathlib.Path: repo = tmp_path / "repo" repo.mkdir() muse_dir(repo).mkdir() return repo def _write_object(repo: pathlib.Path, content: bytes) -> str: from muse.core.types import blob_id oid = blob_id(content) write_object(repo, oid, content) return oid def _make_snapshot(repo: pathlib.Path, files: _FileStore) -> SnapshotRecord: manifest = {} for path, content in files.items(): oid = _write_object(repo, content) manifest[path] = oid snap_id = compute_snapshot_id(manifest) snap = SnapshotRecord( snapshot_id=snap_id, manifest=manifest, directories=[], created_at=_TS, note="", ) write_snapshot(repo, snap) return snap def _make_commit( repo: pathlib.Path, snap_id: str, *, message: str = "test", ts: datetime.datetime = _TS, parent: str | None = None, ) -> CommitRecord: parent_ids = [parent] if parent else [] commit_id = compute_commit_id( parent_ids=parent_ids, snapshot_id=snap_id, message=message, committed_at_iso=ts.isoformat(), author="gabriel", ) record = CommitRecord( commit_id=commit_id, branch="main", snapshot_id=snap_id, message=message, committed_at=ts, parent_commit_id=parent, parent2_commit_id=None, author="gabriel", metadata={}, structured_delta=None, sem_ver_bump="none", breaking_changes=[], agent_id="", model_id="", toolchain_id="", prompt_hash="", signature="", signer_key_id="", reviewed_by=[], test_runs=0, ) write_commit(repo, record) return record def _corrupt_commit_snapshot_id( repo: pathlib.Path, commit_id: str, bad_snapshot_id: str = "f" * 64 ) -> None: """Directly corrupt the snapshot_id field in a commit object on disk.""" import json as _json from muse.core.object_store import object_path as _object_path from muse.core.types import long_id as _long_id path = _object_path(repo, _long_id(commit_id)) raw = path.read_bytes() null_idx = raw.index(b"\0") data = _json.loads(raw[null_idx + 1:]) data["snapshot_id"] = bad_snapshot_id payload = _json.dumps(data, separators=(",", ":")).encode() path.write_bytes(f"commit {len(payload)}\0".encode() + payload) # ────────────────────────────────────────────────────────────────────────────── # Unit: _collect_reachable_objects # ────────────────────────────────────────────────────────────────────────────── class TestCollectReachableObjects: def test_valid_commit_objects_retained(self, tmp_path: pathlib.Path) -> None: repo = _make_repo(tmp_path) snap = _make_snapshot(repo, {"src/a.py": b"content_a"}) _make_commit(repo, snap.snapshot_id) reachable = _collect_reachable_objects(repo) for oid in snap.manifest.values(): assert oid in reachable, f"Object {oid[:8]} should be reachable" def test_corrupt_snapshot_id_objects_still_retained(self, tmp_path: pathlib.Path) -> None: """BUG: When commit's snapshot_id is corrupt, objects are not retained.""" repo = _make_repo(tmp_path) snap = _make_snapshot(repo, {"src/main.py": b"important data"}) commit = _make_commit(repo, snap.snapshot_id) # Corrupt the snapshot_id to a non-existent value _corrupt_commit_snapshot_id(repo, commit.commit_id, "f" * 64) # Verify that read_snapshot now fails for the commit stored = _collect_reachable_objects.__module__ # just to check we're testing the right thing reachable = _collect_reachable_objects(repo) for oid in snap.manifest.values(): assert oid in reachable, ( f"DATA LOSS: Object {oid[:8]} from snapshot {snap.snapshot_id[:8]} " f"was NOT retained by GC after the commit's snapshot_id was corrupted. " f"Running muse gc would delete this object permanently." ) def test_corrupt_snapshot_id_points_to_wrong_existing_snapshot(self, tmp_path: pathlib.Path) -> None: """Even worse: corrupt snapshot_id points to a DIFFERENT existing snapshot. The objects from the ORIGINAL snapshot are still not retained. """ repo = _make_repo(tmp_path) snap1 = _make_snapshot(repo, {"src/file1.py": b"content 1"}) snap2 = _make_snapshot(repo, {"src/file2.py": b"content 2"}) commit = _make_commit(repo, snap1.snapshot_id) # Corrupt: now points to snap2's ID instead of snap1's _corrupt_commit_snapshot_id(repo, commit.commit_id, snap2.snapshot_id) reachable = _collect_reachable_objects(repo) for oid in snap1.manifest.values(): assert oid in reachable, ( f"DATA LOSS: Object from snap1 ({oid[:8]}) not retained — " f"corrupt snapshot_id pointed to snap2 instead, and snap1's " f"objects were not retained. GC would delete them." ) def test_two_commits_one_corrupt_all_objects_retained(self, tmp_path: pathlib.Path) -> None: """One corrupt commit must not prevent the other commit's objects from being retained.""" repo = _make_repo(tmp_path) snap1 = _make_snapshot(repo, {"a.py": b"aaa"}) snap2 = _make_snapshot(repo, {"b.py": b"bbb"}) commit1 = _make_commit(repo, snap1.snapshot_id, message="c1") _make_commit(repo, snap2.snapshot_id, message="c2") _corrupt_commit_snapshot_id(repo, commit1.commit_id, NULL_COMMIT_ID) reachable = _collect_reachable_objects(repo) # Both snapshots' objects must be retained for oid in snap1.manifest.values(): assert oid in reachable, f"snap1 object {oid[:8]} not retained after corruption" for oid in snap2.manifest.values(): assert oid in reachable, f"snap2 object {oid[:8]} not retained" def test_empty_store_no_crash(self, tmp_path: pathlib.Path) -> None: repo = _make_repo(tmp_path) reachable = _collect_reachable_objects(repo) assert reachable == set() def test_valid_commit_chain_all_retained(self, tmp_path: pathlib.Path) -> None: repo = _make_repo(tmp_path) snap1 = _make_snapshot(repo, {"a.py": b"v1"}) snap2 = _make_snapshot(repo, {"a.py": b"v2"}) c1 = _make_commit(repo, snap1.snapshot_id, message="v1") _make_commit(repo, snap2.snapshot_id, message="v2", parent=c1.commit_id) reachable = _collect_reachable_objects(repo) for oid in snap1.manifest.values(): assert oid in reachable for oid in snap2.manifest.values(): assert oid in reachable # ────────────────────────────────────────────────────────────────────────────── # Integration: run_gc with corrupt commit # ────────────────────────────────────────────────────────────────────────────── class TestRunGcCorruptCommit: def test_gc_dry_run_reports_no_collected_objects_for_corrupt_commit(self, tmp_path: pathlib.Path) -> None: """Dry run must show 0 objects to collect when all objects are reachable.""" repo = _make_repo(tmp_path) snap = _make_snapshot(repo, {"main.py": b"code"}) commit = _make_commit(repo, snap.snapshot_id) _corrupt_commit_snapshot_id(repo, commit.commit_id) result = run_gc(repo, dry_run=True, grace_period_seconds=0) assert result.collected_count == 0, ( f"BUG: dry_run GC reports {result.collected_count} objects to collect, " f"but all objects are reachable (just via a corrupt commit). " f"Running without dry_run would permanently delete these objects." ) def test_gc_does_not_delete_objects_from_corrupt_commit(self, tmp_path: pathlib.Path) -> None: """Objects from a corrupt commit's snapshot must survive GC.""" repo = _make_repo(tmp_path) snap = _make_snapshot(repo, {"main.py": b"valuable content"}) commit = _make_commit(repo, snap.snapshot_id) _corrupt_commit_snapshot_id(repo, commit.commit_id) result = run_gc(repo, dry_run=False, grace_period_seconds=0) assert result.collected_count == 0, ( f"DATA LOSS: GC deleted {result.collected_count} object(s) that were " f"reachable from a commit with a corrupt snapshot_id. Those objects " f"are now permanently gone." ) # Verify the object is still on disk oid = list(snap.manifest.values())[0] from muse.core.object_store import read_object assert read_object(repo, oid) is not None, ( f"CONFIRMED DATA LOSS: Object {oid[:8]} was deleted by GC." ) def test_gc_full_retains_objects_when_corrupt_snapshot_file_exists(self, tmp_path: pathlib.Path) -> None: """GC --full must retain objects when a snapshot FILE exists at the correct path but its stored snapshot_id field doesn't match the computed hash. The commit references snap_id S1; the file at S1 has a corrupt snapshot_id field (not the manifest) so _verify_snapshot_id fails. Our raw-fallback path must read the manifest directly and retain its object IDs. """ repo = _make_repo(tmp_path) snap = _make_snapshot(repo, {"main.py": b"important"}) commit = _make_commit(repo, snap.snapshot_id) # Point the branch ref at the commit (making it reachable) h_dir = heads_dir(repo) h_dir.mkdir(parents=True, exist_ok=True) (h_dir / "main").write_text(commit.commit_id) # Corrupt the snapshot object's stored snapshot_id field (NOT the manifest). # The manifest still has the correct object IDs. # _verify_snapshot_id will fail (recomputed hash != stored snapshot_id field), # causing read_snapshot to return None — but the object IDs in the manifest # are still valid and should be retained by our raw-fallback path. import json as _json from muse.core.object_store import object_path as _object_path snap_obj_path = _object_path(repo, snap.snapshot_id) raw = snap_obj_path.read_bytes() null_idx = raw.index(b"\0") snap_data = _json.loads(raw[null_idx + 1:]) oid = list(snap_data["manifest"].values())[0] # save the real object ID snap_data["snapshot_id"] = f"corrupt_id_{'0' * 53}" # corrupt the stored ID field payload = _json.dumps(snap_data, separators=(",", ":")).encode() snap_obj_path.write_bytes(f"snapshot {len(payload)}\0".encode() + payload) result = run_gc(repo, dry_run=False, grace_period_seconds=0, full=True) from muse.core.object_store import read_object obj = read_object(repo, oid) assert obj is not None, ( f"DATA LOSS: GC --full deleted object {oid[:8]} that was referenced " f"in a corrupt snapshot file (corrupt stored snapshot_id field, " f"valid manifest). The raw-fallback path must have retained it." ) def test_gc_full_corrupt_commit_snapshot_id_no_file_no_crash(self, tmp_path: pathlib.Path) -> None: """Document the known edge case: if a commit's snapshot_id is corrupt AND the referenced file doesn't exist, GC --full cannot retain those objects. Users must run `muse verify-pack` before `muse gc --full` in this scenario. This test verifies no crash occurs (the behavior is a known limitation). """ repo = _make_repo(tmp_path) snap = _make_snapshot(repo, {"main.py": b"important"}) commit = _make_commit(repo, snap.snapshot_id) h_dir = heads_dir(repo) h_dir.mkdir(parents=True, exist_ok=True) (h_dir / "main").write_text(commit.commit_id) # Corrupt the commit so its snapshot_id points to a non-existent file _corrupt_commit_snapshot_id(repo, commit.commit_id, "f" * 64) # Known limitation: when commit.snapshot_id points to a non-existent file, # GC --full cannot determine which objects to retain. No crash must occur. result = run_gc(repo, dry_run=False, grace_period_seconds=0, full=True) # No assertion about objects — this is the documented limitation. def test_gc_still_collects_truly_orphaned_objects(self, tmp_path: pathlib.Path) -> None: """Regression: GC must still delete truly unreachable objects.""" repo = _make_repo(tmp_path) # Write an object that is NOT in any snapshot orphan_content = b"orphaned content - no snapshot references this" orphan_oid = _write_object(repo, orphan_content) # Write a valid commit with a snapshot that does NOT reference the orphan snap = _make_snapshot(repo, {"other.py": b"other content"}) _make_commit(repo, snap.snapshot_id) result = run_gc(repo, dry_run=False, grace_period_seconds=0) assert orphan_oid in result.collected_ids, ( f"Orphaned object {orphan_oid[:8]} was not collected by GC. " f"GC is too conservative." ) # ────────────────────────────────────────────────────────────────────────────── # Stress # ────────────────────────────────────────────────────────────────────────────── class TestGcCorruptStress: def test_50_commits_5_corrupt_all_objects_retained(self, tmp_path: pathlib.Path) -> None: """50 commits, 5 with corrupt snapshot_ids: all objects retained, no crash.""" repo = _make_repo(tmp_path) commit_records = [] all_oids: set[str] = set() for i in range(50): content = f"content_{i}".encode() snap = _make_snapshot(repo, {f"f{i}.py": content}) ts = _TS + datetime.timedelta(seconds=i) commit = _make_commit(repo, snap.snapshot_id, message=f"commit {i}", ts=ts) commit_records.append(commit) all_oids.update(snap.manifest.values()) # Corrupt 5 commits (indices 10, 20, 30, 40, 49) corrupt_indices = {10, 20, 30, 40, 49} for idx in corrupt_indices: _corrupt_commit_snapshot_id( repo, commit_records[idx].commit_id, "9" * 64 ) reachable = _collect_reachable_objects(repo) missing = [oid for oid in all_oids if oid not in reachable] assert not missing, ( f"DATA LOSS: {len(missing)} object(s) not retained by GC despite " f"being reachable from snapshots on disk. " f"Missing: {[o[:8] for o in missing[:5]]}" )