gabriel / muse public
test_gc_corrupt_commit_object_retention.py python
405 lines 18.1 KB
Raw
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
1 """Tests for Bug 10: GC deletes objects reachable from a corrupt commit.
2
3 When a commit file has a corrupt snapshot_id field (bit-flip, tampered, or
4 previously written by the now-fixed from_dict timestamp substitution bug),
5 _collect_reachable_objects returns an empty set for that commit's snapshot.
6 muse gc then deletes those objects, permanently destroying file content.
7
8 The sequence of doom:
9 1. Commit A on disk: snapshot_id = "f"*64 (corrupt — should be "S1")
10 2. `get_all_commits` returns this commit (no hash verification)
11 3. `read_snapshot(root, "f"*64)` → None (file for "f"*64 doesn't exist)
12 4. Objects from the REAL snapshot S1 are NOT added to reachable
13 5. `muse gc` deletes those objects (no other commit references them)
14 6. The working tree cannot be reconstructed from commit A
15
16 Fix: `_collect_reachable_objects` must also scan snapshot files directly
17 (without hash verification) when read_snapshot returns None for a commit's
18 snapshot_id. This conservatively retains any object referenced in any
19 snapshot manifest on disk, regardless of whether the commit's snapshot_id
20 field is correct.
21
22 Scope of tests
23 --------------
24 Unit (_collect_reachable_objects):
25 - Objects reachable from a valid commit are retained
26 - Objects reachable from a commit with corrupt snapshot_id (field points
27 to non-existent snapshot) are still retained via raw snapshot scan
28 - Objects reachable from a commit with corrupt snapshot_id (field points
29 to a WRONG existing snapshot) are still retained via raw snapshot scan
30 - GC does NOT delete objects that are in ANY snapshot on disk, even orphaned
31 - Empty store returns empty reachable set (no crash)
32 - Multiple commits, one corrupt: all objects from all snapshots retained
33
34 Integration (run_gc with corrupt commit):
35 - run_gc dry_run=True does not delete anything from a store with corrupt commit
36 - run_gc does not delete objects from corrupt commit's snapshot (default mode)
37 - run_gc --full does not delete objects from corrupt commit's snapshot
38 - Objects from a legitimately unreachable commit ARE deleted (GC still works)
39
40 Stress:
41 - 50 commits, 5 with corrupt snapshot_ids: all objects from all 50 snapshots retained
42 """
43 from __future__ import annotations
44
45 type _FileStore = dict[str, bytes]
46
47 import datetime
48 import hashlib
49 import pathlib
50
51 import pytest
52
53 from muse.core.gc import _collect_reachable_objects, run_gc
54 from muse.core.paths import commits_dir, heads_dir, muse_dir, snapshots_dir
55 from muse.core.types import NULL_COMMIT_ID
56 from muse.core.object_store import write_object
57 from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id
58 from muse.core.commits import (
59 CommitRecord,
60 write_commit,
61 )
62 from muse.core.snapshots import (
63 SnapshotRecord,
64 read_snapshot,
65 write_snapshot,
66 )
67
68 _TS = datetime.datetime(2024, 6, 15, 10, 0, 0, tzinfo=datetime.timezone.utc)
69
70
71 def _make_repo(tmp_path: pathlib.Path) -> pathlib.Path:
72 repo = tmp_path / "repo"
73 repo.mkdir()
74 muse_dir(repo).mkdir()
75 return repo
76
77
78 def _write_object(repo: pathlib.Path, content: bytes) -> str:
79 from muse.core.types import blob_id
80 oid = blob_id(content)
81 write_object(repo, oid, content)
82 return oid
83
84
85 def _make_snapshot(repo: pathlib.Path, files: _FileStore) -> SnapshotRecord:
86 manifest = {}
87 for path, content in files.items():
88 oid = _write_object(repo, content)
89 manifest[path] = oid
90 snap_id = compute_snapshot_id(manifest)
91 snap = SnapshotRecord(
92 snapshot_id=snap_id,
93 manifest=manifest,
94 directories=[],
95 created_at=_TS,
96 note="",
97 )
98 write_snapshot(repo, snap)
99 return snap
100
101
102 def _make_commit(
103 repo: pathlib.Path,
104 snap_id: str,
105 *,
106 message: str = "test",
107 ts: datetime.datetime = _TS,
108 parent: str | None = None,
109 ) -> CommitRecord:
110 parent_ids = [parent] if parent else []
111 commit_id = compute_commit_id(
112 parent_ids=parent_ids,
113 snapshot_id=snap_id,
114 message=message,
115 committed_at_iso=ts.isoformat(),
116 author="gabriel",
117 )
118 record = CommitRecord(
119 commit_id=commit_id,
120 branch="main",
121 snapshot_id=snap_id,
122 message=message,
123 committed_at=ts,
124 parent_commit_id=parent,
125 parent2_commit_id=None,
126 author="gabriel",
127 metadata={},
128 structured_delta=None,
129 sem_ver_bump="none",
130 breaking_changes=[],
131 agent_id="",
132 model_id="",
133 toolchain_id="",
134 prompt_hash="",
135 signature="",
136 signer_key_id="",
137 reviewed_by=[],
138 test_runs=0,
139 )
140 write_commit(repo, record)
141 return record
142
143
144 def _corrupt_commit_snapshot_id(
145 repo: pathlib.Path, commit_id: str, bad_snapshot_id: str = "f" * 64
146 ) -> None:
147 """Directly corrupt the snapshot_id field in a commit object on disk."""
148 import json as _json
149 from muse.core.object_store import object_path as _object_path
150 from muse.core.types import long_id as _long_id
151 path = _object_path(repo, _long_id(commit_id))
152 raw = path.read_bytes()
153 null_idx = raw.index(b"\0")
154 data = _json.loads(raw[null_idx + 1:])
155 data["snapshot_id"] = bad_snapshot_id
156 payload = _json.dumps(data, separators=(",", ":")).encode()
157 path.write_bytes(f"commit {len(payload)}\0".encode() + payload)
158
159
160 # ──────────────────────────────────────────────────────────────────────────────
161 # Unit: _collect_reachable_objects
162 # ──────────────────────────────────────────────────────────────────────────────
163
164 class TestCollectReachableObjects:
165
166 def test_valid_commit_objects_retained(self, tmp_path: pathlib.Path) -> None:
167 repo = _make_repo(tmp_path)
168 snap = _make_snapshot(repo, {"src/a.py": b"content_a"})
169 _make_commit(repo, snap.snapshot_id)
170 reachable = _collect_reachable_objects(repo)
171 for oid in snap.manifest.values():
172 assert oid in reachable, f"Object {oid[:8]} should be reachable"
173
174 def test_corrupt_snapshot_id_objects_still_retained(self, tmp_path: pathlib.Path) -> None:
175 """BUG: When commit's snapshot_id is corrupt, objects are not retained."""
176 repo = _make_repo(tmp_path)
177 snap = _make_snapshot(repo, {"src/main.py": b"important data"})
178 commit = _make_commit(repo, snap.snapshot_id)
179
180 # Corrupt the snapshot_id to a non-existent value
181 _corrupt_commit_snapshot_id(repo, commit.commit_id, "f" * 64)
182
183 # Verify that read_snapshot now fails for the commit
184 stored = _collect_reachable_objects.__module__ # just to check we're testing the right thing
185
186 reachable = _collect_reachable_objects(repo)
187 for oid in snap.manifest.values():
188 assert oid in reachable, (
189 f"DATA LOSS: Object {oid[:8]} from snapshot {snap.snapshot_id[:8]} "
190 f"was NOT retained by GC after the commit's snapshot_id was corrupted. "
191 f"Running muse gc would delete this object permanently."
192 )
193
194 def test_corrupt_snapshot_id_points_to_wrong_existing_snapshot(self, tmp_path: pathlib.Path) -> None:
195 """Even worse: corrupt snapshot_id points to a DIFFERENT existing snapshot.
196 The objects from the ORIGINAL snapshot are still not retained.
197 """
198 repo = _make_repo(tmp_path)
199 snap1 = _make_snapshot(repo, {"src/file1.py": b"content 1"})
200 snap2 = _make_snapshot(repo, {"src/file2.py": b"content 2"})
201 commit = _make_commit(repo, snap1.snapshot_id)
202
203 # Corrupt: now points to snap2's ID instead of snap1's
204 _corrupt_commit_snapshot_id(repo, commit.commit_id, snap2.snapshot_id)
205
206 reachable = _collect_reachable_objects(repo)
207 for oid in snap1.manifest.values():
208 assert oid in reachable, (
209 f"DATA LOSS: Object from snap1 ({oid[:8]}) not retained — "
210 f"corrupt snapshot_id pointed to snap2 instead, and snap1's "
211 f"objects were not retained. GC would delete them."
212 )
213
214 def test_two_commits_one_corrupt_all_objects_retained(self, tmp_path: pathlib.Path) -> None:
215 """One corrupt commit must not prevent the other commit's objects from being retained."""
216 repo = _make_repo(tmp_path)
217 snap1 = _make_snapshot(repo, {"a.py": b"aaa"})
218 snap2 = _make_snapshot(repo, {"b.py": b"bbb"})
219 commit1 = _make_commit(repo, snap1.snapshot_id, message="c1")
220 _make_commit(repo, snap2.snapshot_id, message="c2")
221
222 _corrupt_commit_snapshot_id(repo, commit1.commit_id, NULL_COMMIT_ID)
223
224 reachable = _collect_reachable_objects(repo)
225 # Both snapshots' objects must be retained
226 for oid in snap1.manifest.values():
227 assert oid in reachable, f"snap1 object {oid[:8]} not retained after corruption"
228 for oid in snap2.manifest.values():
229 assert oid in reachable, f"snap2 object {oid[:8]} not retained"
230
231 def test_empty_store_no_crash(self, tmp_path: pathlib.Path) -> None:
232 repo = _make_repo(tmp_path)
233 reachable = _collect_reachable_objects(repo)
234 assert reachable == set()
235
236 def test_valid_commit_chain_all_retained(self, tmp_path: pathlib.Path) -> None:
237 repo = _make_repo(tmp_path)
238 snap1 = _make_snapshot(repo, {"a.py": b"v1"})
239 snap2 = _make_snapshot(repo, {"a.py": b"v2"})
240 c1 = _make_commit(repo, snap1.snapshot_id, message="v1")
241 _make_commit(repo, snap2.snapshot_id, message="v2", parent=c1.commit_id)
242
243 reachable = _collect_reachable_objects(repo)
244 for oid in snap1.manifest.values():
245 assert oid in reachable
246 for oid in snap2.manifest.values():
247 assert oid in reachable
248
249
250 # ──────────────────────────────────────────────────────────────────────────────
251 # Integration: run_gc with corrupt commit
252 # ──────────────────────────────────────────────────────────────────────────────
253
254 class TestRunGcCorruptCommit:
255
256 def test_gc_dry_run_reports_no_collected_objects_for_corrupt_commit(self, tmp_path: pathlib.Path) -> None:
257 """Dry run must show 0 objects to collect when all objects are reachable."""
258 repo = _make_repo(tmp_path)
259 snap = _make_snapshot(repo, {"main.py": b"code"})
260 commit = _make_commit(repo, snap.snapshot_id)
261 _corrupt_commit_snapshot_id(repo, commit.commit_id)
262
263 result = run_gc(repo, dry_run=True, grace_period_seconds=0)
264 assert result.collected_count == 0, (
265 f"BUG: dry_run GC reports {result.collected_count} objects to collect, "
266 f"but all objects are reachable (just via a corrupt commit). "
267 f"Running without dry_run would permanently delete these objects."
268 )
269
270 def test_gc_does_not_delete_objects_from_corrupt_commit(self, tmp_path: pathlib.Path) -> None:
271 """Objects from a corrupt commit's snapshot must survive GC."""
272 repo = _make_repo(tmp_path)
273 snap = _make_snapshot(repo, {"main.py": b"valuable content"})
274 commit = _make_commit(repo, snap.snapshot_id)
275 _corrupt_commit_snapshot_id(repo, commit.commit_id)
276
277 result = run_gc(repo, dry_run=False, grace_period_seconds=0)
278 assert result.collected_count == 0, (
279 f"DATA LOSS: GC deleted {result.collected_count} object(s) that were "
280 f"reachable from a commit with a corrupt snapshot_id. Those objects "
281 f"are now permanently gone."
282 )
283 # Verify the object is still on disk
284 oid = list(snap.manifest.values())[0]
285 from muse.core.object_store import read_object
286 assert read_object(repo, oid) is not None, (
287 f"CONFIRMED DATA LOSS: Object {oid[:8]} was deleted by GC."
288 )
289
290 def test_gc_full_retains_objects_when_corrupt_snapshot_file_exists(self, tmp_path: pathlib.Path) -> None:
291 """GC --full must retain objects when a snapshot FILE exists at the correct
292 path but its stored snapshot_id field doesn't match the computed hash.
293 The commit references snap_id S1; the file at S1 has a corrupt snapshot_id
294 field (not the manifest) so _verify_snapshot_id fails. Our raw-fallback
295 path must read the manifest directly and retain its object IDs.
296 """
297 repo = _make_repo(tmp_path)
298 snap = _make_snapshot(repo, {"main.py": b"important"})
299 commit = _make_commit(repo, snap.snapshot_id)
300
301 # Point the branch ref at the commit (making it reachable)
302 h_dir = heads_dir(repo)
303 h_dir.mkdir(parents=True, exist_ok=True)
304 (h_dir / "main").write_text(commit.commit_id)
305
306 # Corrupt the snapshot object's stored snapshot_id field (NOT the manifest).
307 # The manifest still has the correct object IDs.
308 # _verify_snapshot_id will fail (recomputed hash != stored snapshot_id field),
309 # causing read_snapshot to return None — but the object IDs in the manifest
310 # are still valid and should be retained by our raw-fallback path.
311 import json as _json
312 from muse.core.object_store import object_path as _object_path
313 snap_obj_path = _object_path(repo, snap.snapshot_id)
314 raw = snap_obj_path.read_bytes()
315 null_idx = raw.index(b"\0")
316 snap_data = _json.loads(raw[null_idx + 1:])
317 oid = list(snap_data["manifest"].values())[0] # save the real object ID
318 snap_data["snapshot_id"] = f"corrupt_id_{'0' * 53}" # corrupt the stored ID field
319 payload = _json.dumps(snap_data, separators=(",", ":")).encode()
320 snap_obj_path.write_bytes(f"snapshot {len(payload)}\0".encode() + payload)
321
322 result = run_gc(repo, dry_run=False, grace_period_seconds=0, full=True)
323 from muse.core.object_store import read_object
324 obj = read_object(repo, oid)
325 assert obj is not None, (
326 f"DATA LOSS: GC --full deleted object {oid[:8]} that was referenced "
327 f"in a corrupt snapshot file (corrupt stored snapshot_id field, "
328 f"valid manifest). The raw-fallback path must have retained it."
329 )
330
331 def test_gc_full_corrupt_commit_snapshot_id_no_file_no_crash(self, tmp_path: pathlib.Path) -> None:
332 """Document the known edge case: if a commit's snapshot_id is corrupt AND
333 the referenced file doesn't exist, GC --full cannot retain those objects.
334 Users must run `muse verify-pack` before `muse gc --full` in this scenario.
335 This test verifies no crash occurs (the behavior is a known limitation).
336 """
337 repo = _make_repo(tmp_path)
338 snap = _make_snapshot(repo, {"main.py": b"important"})
339 commit = _make_commit(repo, snap.snapshot_id)
340
341 h_dir = heads_dir(repo)
342 h_dir.mkdir(parents=True, exist_ok=True)
343 (h_dir / "main").write_text(commit.commit_id)
344
345 # Corrupt the commit so its snapshot_id points to a non-existent file
346 _corrupt_commit_snapshot_id(repo, commit.commit_id, "f" * 64)
347
348 # Known limitation: when commit.snapshot_id points to a non-existent file,
349 # GC --full cannot determine which objects to retain. No crash must occur.
350 result = run_gc(repo, dry_run=False, grace_period_seconds=0, full=True)
351 # No assertion about objects — this is the documented limitation.
352
353 def test_gc_still_collects_truly_orphaned_objects(self, tmp_path: pathlib.Path) -> None:
354 """Regression: GC must still delete truly unreachable objects."""
355 repo = _make_repo(tmp_path)
356 # Write an object that is NOT in any snapshot
357 orphan_content = b"orphaned content - no snapshot references this"
358 orphan_oid = _write_object(repo, orphan_content)
359
360 # Write a valid commit with a snapshot that does NOT reference the orphan
361 snap = _make_snapshot(repo, {"other.py": b"other content"})
362 _make_commit(repo, snap.snapshot_id)
363
364 result = run_gc(repo, dry_run=False, grace_period_seconds=0)
365 assert orphan_oid in result.collected_ids, (
366 f"Orphaned object {orphan_oid[:8]} was not collected by GC. "
367 f"GC is too conservative."
368 )
369
370
371 # ──────────────────────────────────────────────────────────────────────────────
372 # Stress
373 # ──────────────────────────────────────────────────────────────────────────────
374
375 class TestGcCorruptStress:
376
377 def test_50_commits_5_corrupt_all_objects_retained(self, tmp_path: pathlib.Path) -> None:
378 """50 commits, 5 with corrupt snapshot_ids: all objects retained, no crash."""
379 repo = _make_repo(tmp_path)
380 commit_records = []
381 all_oids: set[str] = set()
382
383 for i in range(50):
384 content = f"content_{i}".encode()
385 snap = _make_snapshot(repo, {f"f{i}.py": content})
386 ts = _TS + datetime.timedelta(seconds=i)
387 commit = _make_commit(repo, snap.snapshot_id, message=f"commit {i}", ts=ts)
388 commit_records.append(commit)
389 all_oids.update(snap.manifest.values())
390
391 # Corrupt 5 commits (indices 10, 20, 30, 40, 49)
392 corrupt_indices = {10, 20, 30, 40, 49}
393 for idx in corrupt_indices:
394 _corrupt_commit_snapshot_id(
395 repo, commit_records[idx].commit_id, "9" * 64
396 )
397
398 reachable = _collect_reachable_objects(repo)
399
400 missing = [oid for oid in all_oids if oid not in reachable]
401 assert not missing, (
402 f"DATA LOSS: {len(missing)} object(s) not retained by GC despite "
403 f"being reachable from snapshots on disk. "
404 f"Missing: {[o[:8] for o in missing[:5]]}"
405 )
File History 4 commits
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 22 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 29 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 29 days ago