test_object_store_write_taxonomy.py
python
sha256:248464b6a2f758985cbef90f864fa62c61842be699d975d6e00b6a9509ef919c
fix(delta): detect blob-identical file renames for files wi…
Sonnet 4.6
patch
24 days ago
| 1 | """Object store write taxonomy — exhaustive correctness and safety tests. |
| 2 | |
| 3 | Every path that writes OR deletes objects is enumerated here. Each test |
| 4 | targets one invariant. If a test fails, it means a write or delete path is |
| 5 | broken; fix the production code, not the test. |
| 6 | |
| 7 | Write paths covered |
| 8 | ------------------- |
| 9 | W-1 write_object() — primary low-level write |
| 10 | W-2 write_object_from_path() — write from filesystem file |
| 11 | W-3 commit workflow — muse commit writes blobs then snapshot |
| 12 | W-4 shelf save — blobs written before shelf entry |
| 13 | W-5 fetch / pull _on_object — objects written on receive |
| 14 | W-6 apply_mpack — mpack unbundle writes objects |
| 15 | W-7 domain merge — plugin merge writes merged blob |
| 16 | W-8 hash_object --write — explicit low-level write |
| 17 | |
| 18 | Delete paths covered |
| 19 | -------------------- |
| 20 | D-1 gc non-full (default) — orphan sweep via snapshots walker |
| 21 | D-2 gc full — tight reachability from live refs |
| 22 | D-3 gc full multi-branch — objects on ALL branches survive |
| 23 | D-4 gc full object normalisation — sha256: prefixed IDs in reachable set |
| 24 | D-5 prune — mirrors gc non-full with expire window |
| 25 | D-6 maintenance gc task — calls run_gc with full=True |
| 26 | |
| 27 | Consistency invariants |
| 28 | ---------------------- |
| 29 | C-1 write → has_object True |
| 30 | C-2 write → object_state PRESENT |
| 31 | C-3 write → iter_stored_objects finds it |
| 32 | C-4 has_object and object_state agree |
| 33 | C-5 object_path canonical location |
| 34 | C-6 no write → object_state MISSING (no promisors) |
| 35 | C-7 no write → object_state PROMISED (promisors configured) |
| 36 | """ |
| 37 | |
| 38 | from __future__ import annotations |
| 39 | |
| 40 | import datetime |
| 41 | import json |
| 42 | import pathlib |
| 43 | import tempfile |
| 44 | from collections.abc import Mapping |
| 45 | |
| 46 | import pytest |
| 47 | |
| 48 | from muse.core.types import Manifest, blob_id, long_id, split_id |
| 49 | from muse.core.gc import run_gc, _collect_reachable_snapshots, _collect_reachable_commits |
| 50 | from muse.core.object_availability import ObjectState, load_promisor_remotes, object_state |
| 51 | from muse.core.object_store import ( |
| 52 | has_object, |
| 53 | iter_stored_objects, |
| 54 | object_path, |
| 55 | read_object, |
| 56 | write_object, |
| 57 | write_object_from_path, |
| 58 | ) |
| 59 | from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id |
| 60 | from muse.core.commits import ( |
| 61 | CommitRecord, |
| 62 | write_commit, |
| 63 | ) |
| 64 | from muse.core.snapshots import ( |
| 65 | SnapshotRecord, |
| 66 | write_snapshot, |
| 67 | ) |
| 68 | from muse.core.shelf import write_shelf_entry |
| 69 | from muse.core.paths import muse_dir, objects_dir, ref_path, shelf_dir |
| 70 | |
| 71 | |
| 72 | # --------------------------------------------------------------------------- |
| 73 | # Shared helpers |
| 74 | # --------------------------------------------------------------------------- |
| 75 | |
| 76 | |
def _repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Lay out a bare-bones ``.muse`` repository under *tmp_path*.

    Creates the object, commit, snapshot and branch-ref directories, a
    ``repo.json`` with a fixed repo id, and a ``HEAD`` file pointing at
    ``refs/heads/main``.  Returns *tmp_path* itself so callers can use it
    as the repository root.
    """
    muse_root = muse_dir(tmp_path)
    skeleton = (
        "objects/sha256",
        "commits/sha256",
        "snapshots/sha256",
        "refs/heads",
    )
    for relative in skeleton:
        target = muse_root / relative
        target.mkdir(parents=True, exist_ok=True)

    repo_meta = json.dumps({"repo_id": "test-repo"})
    (muse_root / "repo.json").write_text(repo_meta)
    (muse_root / "HEAD").write_text("ref: refs/heads/main\n")
    return tmp_path
| 85 | |
| 86 | |
def _write_blob(repo: pathlib.Path, content: bytes) -> str:
    """Store *content* in *repo*'s object store and return its blob id."""
    object_id = blob_id(content)
    write_object(repo, object_id, content)
    return object_id
| 91 | |
| 92 | |
def _write_shelf_entry(repo: pathlib.Path, snapshot: Mapping[str, str]) -> None:
    """Write a shelf entry for *snapshot* on branch ``main``.

    The entry's ``id`` is derived from the sorted-key JSON encoding of the
    entry *before* the ``id`` field is added, then re-prefixed with
    ``sha256:``.

    Args:
        repo: Repository root, as returned by ``_repo``.
        snapshot: Mapping stored (as a plain dict copy) under the entry's
            ``"snapshot"`` key.
    """
    entry: dict[str, object] = {
        "snapshot": dict(snapshot),
        "branch": "main",
        "created_at": "2026-01-01T00:00:00+00:00",
    }
    # Uses the module-level ``json`` import; a function-local alias is redundant.
    raw_bytes = json.dumps(entry, sort_keys=True).encode()
    _, hex_id = split_id(blob_id(raw_bytes))
    entry["id"] = f"sha256:{hex_id}"
    write_shelf_entry(repo, entry)
| 104 | |
| 105 | |
def _write_snap(repo: pathlib.Path, manifest: Manifest) -> str:
    """Write a snapshot record for *manifest* and return its snapshot id."""
    snapshot_id = compute_snapshot_id(manifest)
    record = SnapshotRecord(snapshot_id=snapshot_id, manifest=manifest)
    write_snapshot(repo, record)
    return snapshot_id
| 110 | |
| 111 | |
def _write_commit_on_branch(
    repo: pathlib.Path,
    snap_id: str,
    branch: str = "main",
    parent_id: str | None = None,
    message: str = "test",
) -> str:
    """Write a commit for *snap_id* and point *branch*'s ref file at it.

    The commit timestamp is fixed at 2026-01-01 UTC.  A falsy *parent_id*
    produces an empty parent list when computing the commit id.  Returns
    the new commit id.
    """
    timestamp = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)

    parents: list[str] = []
    if parent_id:
        parents.append(parent_id)

    new_commit_id = compute_commit_id(
        parent_ids=parents,
        snapshot_id=snap_id,
        message=message,
        committed_at_iso=timestamp.isoformat(),
    )
    record = CommitRecord(
        commit_id=new_commit_id,
        branch=branch,
        snapshot_id=snap_id,
        message=message,
        committed_at=timestamp,
        parent_commit_id=parent_id,
    )
    write_commit(repo, record)

    # Make sure the ref's parent directory exists before writing the ref file.
    branch_ref = ref_path(repo, branch)
    branch_ref.parent.mkdir(parents=True, exist_ok=True)
    branch_ref.write_text(new_commit_id)
    return new_commit_id
| 142 | |
| 143 | |
| 144 | # --------------------------------------------------------------------------- |
| 145 | # W-1 write_object — canonical path |
| 146 | # --------------------------------------------------------------------------- |
| 147 | |
| 148 | |
class TestWriteObject:
    """W-1: ``write_object()`` stores blobs at the sharded ``sha256/`` path."""

    def test_lands_under_sha256_dir(self, tmp_path: pathlib.Path) -> None:
        """The stored file sits two levels below a directory named ``sha256``."""
        repo = _repo(tmp_path)
        payload = b"hello"
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        stored = object_path(repo, oid)
        assert stored.exists()
        assert stored.parent.parent.name == "sha256"

    def test_shard_prefix_is_first_two_hex_chars(self, tmp_path: pathlib.Path) -> None:
        """The shard directory is named after the first two hex digits."""
        repo = _repo(tmp_path)
        payload = b"shard-check"
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        shard_dir = object_path(repo, oid).parent
        _, hex_digest = split_id(oid)
        assert shard_dir.name == hex_digest[:2]

    def test_filename_is_remaining_62_hex_chars(self, tmp_path: pathlib.Path) -> None:
        """The file name is the hex digest minus its two-character shard prefix."""
        repo = _repo(tmp_path)
        payload = b"filename-check"
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        stored = object_path(repo, oid)
        _, hex_digest = split_id(oid)
        assert stored.name == hex_digest[2:]

    def test_idempotent_returns_false_on_second_write(
        self, tmp_path: pathlib.Path
    ) -> None:
        """The first write returns True; repeating it returns False."""
        repo = _repo(tmp_path)
        payload = b"idempotent"
        oid = blob_id(payload)
        first = write_object(repo, oid, payload)
        assert first is True
        second = write_object(repo, oid, payload)
        assert second is False

    def test_content_verifiable_after_write(self, tmp_path: pathlib.Path) -> None:
        """``read_object`` returns exactly the bytes that were written."""
        repo = _repo(tmp_path)
        payload = b"verifiable content"
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        round_tripped = read_object(repo, oid)
        assert round_tripped == payload

    def test_rejects_wrong_content(self, tmp_path: pathlib.Path) -> None:
        """Writing bytes that do not hash to the given id raises ValueError."""
        repo = _repo(tmp_path)
        expected_oid = blob_id(b"correct")
        with pytest.raises(ValueError):
            write_object(repo, expected_oid, b"wrong content")

    def test_rejects_bare_hex_object_id(self, tmp_path: pathlib.Path) -> None:
        """An id without the algorithm prefix is rejected."""
        repo = _repo(tmp_path)
        _, bare_hex = split_id(blob_id(b"bare"))
        # NOTE(review): ``Exception`` already covers ``ValueError``, so any
        # error satisfies this check — tighten once the production exception
        # type is confirmed.
        with pytest.raises((ValueError, Exception)):
            write_object(repo, bare_hex, b"bare")
| 204 | |
| 205 | |
| 206 | # --------------------------------------------------------------------------- |
| 207 | # W-2 write_object_from_path — canonical path |
| 208 | # --------------------------------------------------------------------------- |
| 209 | |
| 210 | |
class TestWriteObjectFromPath:
    """W-2: write_object_from_path() writes from a file and lands at canonical path."""

    def test_writes_to_sha256_dir(self, tmp_path: pathlib.Path) -> None:
        """The object written from a source file lands under ``sha256/``."""
        repo = _repo(tmp_path)
        src = tmp_path / "source.txt"
        content = b"from-path content"
        src.write_bytes(content)
        oid = blob_id(content)
        write_object_from_path(repo, oid, src)
        p = object_path(repo, oid)
        assert p.exists()
        assert p.parent.parent.name == "sha256"

    def test_oid_matches_blob_id(self, tmp_path: pathlib.Path) -> None:
        """The bytes stored under *oid* must themselves hash to *oid*.

        Re-hashing what ``read_object`` returns checks the store, rather
        than comparing ``blob_id(content)`` with itself.
        """
        repo = _repo(tmp_path)
        content = b"oid must match blob_id"
        src = tmp_path / "f.txt"
        src.write_bytes(content)
        oid = blob_id(content)
        write_object_from_path(repo, oid, src)
        assert blob_id(read_object(repo, oid)) == oid

    def test_content_readable_after_write(self, tmp_path: pathlib.Path) -> None:
        """``read_object`` returns the source file's bytes unchanged."""
        repo = _repo(tmp_path)
        content = b"readable after write"
        src = tmp_path / "r.txt"
        src.write_bytes(content)
        oid = blob_id(content)
        write_object_from_path(repo, oid, src)
        assert read_object(repo, oid) == content
| 242 | |
| 243 | |
| 244 | # --------------------------------------------------------------------------- |
| 245 | # C-1 … C-7 Consistency invariants |
| 246 | # --------------------------------------------------------------------------- |
| 247 | |
| 248 | |
class TestConsistencyInvariants:
    """C-1 … C-7: write, has_object, object_state and iteration must agree."""

    def test_c1_has_object_true_after_write(self, tmp_path: pathlib.Path) -> None:
        """C-1: a written object is reported by ``has_object``."""
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"c1")
        assert has_object(repo, written)

    def test_c2_object_state_present_after_write(self, tmp_path: pathlib.Path) -> None:
        """C-2: a written object has state PRESENT with no promisors."""
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"c2")
        assert object_state(repo, written, []) == ObjectState.PRESENT

    def test_c3_iter_stored_objects_finds_written(
        self, tmp_path: pathlib.Path
    ) -> None:
        """C-3: a written object's id is yielded by ``iter_stored_objects``."""
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"c3")
        stored_ids = set()
        for stored_id, _ in iter_stored_objects(repo):
            stored_ids.add(stored_id)
        assert written in stored_ids

    def test_c4_has_object_and_object_state_agree_present(
        self, tmp_path: pathlib.Path
    ) -> None:
        """C-4: both APIs report a written object as present."""
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"c4-present")
        assert has_object(repo, written)
        reported = object_state(repo, written, [])
        assert reported == ObjectState.PRESENT

    def test_c4_has_object_and_object_state_agree_absent(
        self, tmp_path: pathlib.Path
    ) -> None:
        """C-4: both APIs report a never-written object as absent."""
        repo = _repo(tmp_path)
        unwritten = blob_id(b"never written")
        assert not has_object(repo, unwritten)
        reported = object_state(repo, unwritten, [])
        assert reported == ObjectState.MISSING

    def test_c5_object_path_canonical_location(self, tmp_path: pathlib.Path) -> None:
        """C-5: ``object_path`` is ``<objects>/sha256/<2 hex>/<rest>``."""
        repo = _repo(tmp_path)
        payload = b"canonical"
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        actual = object_path(repo, oid)
        _, hex_digest = split_id(oid)
        shard, remainder = hex_digest[:2], hex_digest[2:]
        canonical = objects_dir(repo) / "sha256" / shard / remainder
        assert actual == canonical
        assert actual.exists()

    def test_c6_object_state_missing_when_absent_no_promisors(
        self, tmp_path: pathlib.Path
    ) -> None:
        """C-6: an absent object with no promisor remotes is MISSING."""
        repo = _repo(tmp_path)
        unwritten = blob_id(b"missing")
        reported = object_state(repo, unwritten, promisor_remotes=[])
        assert reported == ObjectState.MISSING

    def test_c7_object_state_promised_when_absent_with_promisor(
        self, tmp_path: pathlib.Path
    ) -> None:
        """C-7: an absent object with a promisor remote is PROMISED."""
        repo = _repo(tmp_path)
        unwritten = blob_id(b"promised")
        reported = object_state(repo, unwritten, promisor_remotes=["staging"])
        assert reported == ObjectState.PROMISED

    def test_c7_object_state_present_beats_promisor(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A stored object stays PRESENT even when a promisor is configured."""
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"present beats promisor")
        reported = object_state(repo, written, promisor_remotes=["staging"])
        assert reported == ObjectState.PRESENT
| 322 | |
| 323 | |
| 324 | # --------------------------------------------------------------------------- |
| 325 | # D-1 GC non-full — orphan sweep |
| 326 | # --------------------------------------------------------------------------- |
| 327 | |
| 328 | |
class TestGcNonFull:
    """D-1: default (non-full) GC removes orphans and keeps reachable objects."""

    def test_orphan_collected(self, tmp_path: pathlib.Path) -> None:
        """An unreferenced blob is removed when the grace period is zero."""
        repo = _repo(tmp_path)
        orphan = _write_blob(repo, b"orphan")
        run_gc(repo, grace_period_seconds=0)
        assert not object_path(repo, orphan).exists()

    def test_reachable_via_snapshot_survives(self, tmp_path: pathlib.Path) -> None:
        """A blob referenced by a committed snapshot on ``main`` is kept."""
        repo = _repo(tmp_path)
        live = _write_blob(repo, b"reachable")
        snapshot = _write_snap(repo, {"f.txt": live})
        _write_commit_on_branch(repo, snapshot)
        run_gc(repo, grace_period_seconds=0)
        assert object_path(repo, live).exists()

    def test_reachable_on_non_default_branch_survives(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A blob referenced only from branch ``dev`` is kept."""
        repo = _repo(tmp_path)
        live = _write_blob(repo, b"non-default branch")
        snapshot = _write_snap(repo, {"g.txt": live})
        _write_commit_on_branch(repo, snapshot, branch="dev")
        run_gc(repo, grace_period_seconds=0)
        assert object_path(repo, live).exists()

    def test_multiple_orphans_all_collected(self, tmp_path: pathlib.Path) -> None:
        """Five orphans are all removed and the result reports a count of five."""
        repo = _repo(tmp_path)
        orphans = []
        for index in range(5):
            orphans.append(_write_blob(repo, f"o{index}".encode()))
        outcome = run_gc(repo, grace_period_seconds=0)
        assert outcome.collected_count == 5
        for orphan in orphans:
            assert not object_path(repo, orphan).exists()

    def test_grace_period_protects_recent_objects(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A freshly written orphan survives a one-hour grace period."""
        repo = _repo(tmp_path)
        fresh = _write_blob(repo, b"fresh orphan")
        outcome = run_gc(repo, grace_period_seconds=3600)
        assert outcome.collected_count == 0
        assert object_path(repo, fresh).exists()
| 372 | |
| 373 | |
| 374 | # --------------------------------------------------------------------------- |
| 375 | # D-2 GC full — tight reachability |
| 376 | # --------------------------------------------------------------------------- |
| 377 | |
| 378 | |
class TestGcFull:
    """D-2: full GC uses tight reachability yet must keep every live object."""

    def test_reachable_object_survives_full_gc(self, tmp_path: pathlib.Path) -> None:
        """A committed blob is not collected by full GC."""
        repo = _repo(tmp_path)
        live = _write_blob(repo, b"live object")
        snapshot = _write_snap(repo, {"live.txt": live})
        _write_commit_on_branch(repo, snapshot)
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, live).exists()

    def test_orphan_collected_by_full_gc(self, tmp_path: pathlib.Path) -> None:
        """With one live blob and one orphan, only the orphan is removed."""
        repo = _repo(tmp_path)
        # The live blob is committed first; the orphan is written afterwards
        # and never referenced.
        live = _write_blob(repo, b"live")
        snapshot = _write_snap(repo, {"f.txt": live})
        _write_commit_on_branch(repo, snapshot)
        orphan = _write_blob(repo, b"orphan")
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 1
        assert not object_path(repo, orphan).exists()
        assert object_path(repo, live).exists()

    def test_full_gc_dry_run_does_not_delete(self, tmp_path: pathlib.Path) -> None:
        """A dry run reports ``dry_run`` and leaves the orphan on disk."""
        repo = _repo(tmp_path)
        orphan = _write_blob(repo, b"dry-run orphan")
        outcome = run_gc(repo, full=True, dry_run=True, grace_period_seconds=0)
        assert outcome.dry_run is True
        assert object_path(repo, orphan).exists()
| 409 | |
| 410 | |
| 411 | # --------------------------------------------------------------------------- |
| 412 | # D-3 GC full multi-branch — objects on ALL live branches survive |
| 413 | # --------------------------------------------------------------------------- |
| 414 | |
| 415 | |
class TestGcFullMultiBranch:
    """D-3: full GC keeps objects reachable from every live branch, not just HEAD."""

    def test_object_on_secondary_branch_survives_full_gc(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Blobs committed on ``main`` and on ``dev`` both survive full GC."""
        repo = _repo(tmp_path)
        # Commit on main.
        main_blob = _write_blob(repo, b"main content")
        main_snapshot = _write_snap(repo, {"main.txt": main_blob})
        _write_commit_on_branch(repo, main_snapshot, branch="main")
        # Commit on dev with different content.
        dev_blob = _write_blob(repo, b"dev content")
        dev_snapshot = _write_snap(repo, {"dev.txt": dev_blob})
        _write_commit_on_branch(repo, dev_snapshot, branch="dev")

        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, main_blob).exists(), "main branch object deleted!"
        assert object_path(repo, dev_blob).exists(), "dev branch object deleted!"

    def test_object_on_three_branches_all_survive(
        self, tmp_path: pathlib.Path
    ) -> None:
        """One distinct blob per branch across three branches; all survive."""
        repo = _repo(tmp_path)
        per_branch_blobs = []
        for branch_name in ("main", "dev", "feat/x"):
            blob = _write_blob(repo, f"content on {branch_name}".encode())
            snapshot = _write_snap(repo, {f"{branch_name}.txt": blob})
            _write_commit_on_branch(repo, snapshot, branch=branch_name)
            per_branch_blobs.append(blob)

        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        for blob in per_branch_blobs:
            assert object_path(repo, blob).exists()

    def test_shared_object_referenced_by_two_branches_survives(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A blob referenced by both ``main`` and ``dev`` is kept by full GC."""
        repo = _repo(tmp_path)
        shared = _write_blob(repo, b"shared content")
        for branch_name in ("main", "dev"):
            snapshot = _write_snap(repo, {"shared.txt": shared})
            _write_commit_on_branch(repo, snapshot, branch=branch_name)

        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, shared).exists()
| 463 | |
| 464 | |
| 465 | # --------------------------------------------------------------------------- |
| 466 | # D-4 GC full object ID normalisation |
| 467 | # --------------------------------------------------------------------------- |
| 468 | |
| 469 | |
class TestGcFullObjectNormalisation:
    """D-4: full GC reachability set uses sha256:-prefixed IDs throughout.

    This is the critical invariant that ensures the reachable-objects set
    (built from snapshot manifests) matches the stored-objects set
    (built from iter_stored_objects). A mismatch would cause live objects
    to be incorrectly classified as unreachable and deleted.
    """

    def test_reachable_set_uses_prefixed_ids(self, tmp_path: pathlib.Path) -> None:
        """_collect_reachable_snapshots returns sha256:-prefixed object IDs."""
        repo = _repo(tmp_path)
        oid = _write_blob(repo, b"normalisation check")
        snap_id = _write_snap(repo, {"f.txt": oid})
        _write_commit_on_branch(repo, snap_id)
        reachable_commits = _collect_reachable_commits(repo)
        _, reachable_objs = _collect_reachable_snapshots(repo, reachable_commits)
        # Without this check the loop below passes vacuously on an empty set.
        assert oid in reachable_objs, f"{oid} missing from reachable set"
        # Every entry must carry the sha256: prefix.
        for obj_id in reachable_objs:
            assert obj_id.startswith("sha256:"), (
                f"Reachable object ID missing sha256: prefix: {obj_id!r}"
            )

    def test_iter_stored_objects_uses_prefixed_ids(
        self, tmp_path: pathlib.Path
    ) -> None:
        """iter_stored_objects returns sha256:-prefixed object IDs."""
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"stored check")
        stored_ids = [oid for oid, _ in iter_stored_objects(repo)]
        # Without this check the loop below passes vacuously if nothing is yielded.
        assert written in stored_ids, f"{written} missing from stored set"
        for oid in stored_ids:
            assert oid.startswith("sha256:"), (
                f"iter_stored_objects returned unprefixed ID: {oid!r}"
            )

    def test_reachable_set_matches_stored_set_for_live_objects(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Every live object must appear in both sets with the same ID form."""
        repo = _repo(tmp_path)
        # A list keeps the path -> object mapping deterministic; enumerating a
        # set of strings gives a different pairing from run to run.
        oids: list[str] = []
        for i in range(3):
            oids.append(_write_blob(repo, f"live {i}".encode()))
        manifest = {f"f{index}.txt": oid for index, oid in enumerate(oids)}
        snap_id = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snap_id)
        reachable_commits = _collect_reachable_commits(repo)
        _, reachable_objs = _collect_reachable_snapshots(repo, reachable_commits)
        stored = {o for o, _ in iter_stored_objects(repo)}
        # All live objects must be in both sets.
        for oid in oids:
            assert oid in reachable_objs, f"{oid} missing from reachable set"
            assert oid in stored, f"{oid} missing from stored set"

    def test_full_gc_does_not_delete_prefixed_manifest_objects(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Regression: full GC must not delete objects whose IDs use sha256: prefix in the manifest."""
        repo = _repo(tmp_path)
        contents = [f"file {i} content".encode() for i in range(5)]
        manifest: dict[str, str] = {}
        for i, c in enumerate(contents):
            oid = _write_blob(repo, c)
            manifest[f"file{i}.py"] = oid
            # Confirm the manifest value is prefixed.
            assert oid.startswith("sha256:"), f"blob_id returned unprefixed: {oid}"
        snap_id = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snap_id)
        result = run_gc(repo, full=True, grace_period_seconds=0)
        assert result.collected_count == 0, (
            f"Full GC deleted {result.collected_count} live objects: {result.collected_ids}"
        )
        for oid in manifest.values():
            assert object_path(repo, oid).exists(), f"Full GC deleted live object {oid}"

    def test_full_gc_retains_large_manifest(self, tmp_path: pathlib.Path) -> None:
        """Full GC must not delete any of N live objects in a large snapshot."""
        repo = _repo(tmp_path)
        n = 50
        manifest: dict[str, str] = {}
        for i in range(n):
            oid = _write_blob(repo, f"large manifest entry {i}".encode())
            manifest[f"src/file_{i:03d}.py"] = oid
        snap_id = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snap_id)
        result = run_gc(repo, full=True, grace_period_seconds=0)
        assert result.collected_count == 0, (
            f"Full GC deleted objects from large manifest: {result.collected_ids[:5]}"
        )
| 558 | |
| 559 | |
| 560 | # --------------------------------------------------------------------------- |
| 561 | # D-5 Prune — mirrors non-full GC with expire window |
| 562 | # --------------------------------------------------------------------------- |
| 563 | |
| 564 | |
class TestPruneSafety:
    """D-5: muse prune must never delete reachable objects."""

    def test_prune_does_not_remove_committed_object(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Objects referenced by commits must survive prune.

        This exercises ``run_gc`` in non-full mode directly, using the
        module-level import.
        """
        # NOTE(review): the original comments state that prune delegates to
        # non-full gc; that is not verifiable from this file — confirm.
        repo = _repo(tmp_path)
        oid = _write_blob(repo, b"committed object")
        snap_id = _write_snap(repo, {"f.txt": oid})
        _write_commit_on_branch(repo, snap_id)
        # Non-full GC is what prune uses.
        result = run_gc(repo, grace_period_seconds=0)
        assert result.collected_count == 0
        assert object_path(repo, oid).exists()
| 581 | |
| 582 | |
| 583 | # --------------------------------------------------------------------------- |
| 584 | # D-6 Maintenance gc task passes full=True |
| 585 | # --------------------------------------------------------------------------- |
| 586 | |
| 587 | |
class TestMaintenanceGcUsesFull:
    """D-6: the maintenance 'gc' task must invoke run_gc with full=True."""

    def test_maintenance_gc_task_calls_run_gc_with_full_true(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Confirm _run_gc (the maintenance task) passes full=True to run_gc."""
        from muse.cli.commands import maintenance as maint_mod
        from muse.core.gc import GcResult

        calls: list[dict[str, bool]] = []

        def _capture_run_gc(
            root: pathlib.Path,
            *,
            dry_run: bool = False,
            grace_period_seconds: float = 0.0,
            full: bool = False,
        ) -> GcResult:
            # The defaults let a caller that omits ``full`` reach the
            # descriptive assertion below instead of failing with TypeError.
            calls.append({"full": full, "dry_run": dry_run})
            return GcResult(
                dry_run=dry_run,
                grace_period_seconds=grace_period_seconds,
                full=full,
            )

        monkeypatch.setattr(maint_mod, "run_gc", _capture_run_gc)
        repo = _repo(tmp_path)
        maint_mod._run_gc(repo)
        assert calls, "run_gc was never called by maintenance _run_gc"
        assert calls[0]["full"] is True, (
            f"Maintenance gc must pass full=True, got full={calls[0]['full']}"
        )

    def test_maintenance_gc_retains_all_reachable_objects(
        self, tmp_path: pathlib.Path
    ) -> None:
        """End-to-end: running the maintenance gc task must not delete live objects."""
        from muse.cli.commands.maintenance import _run_gc as maintenance_run_gc

        repo = _repo(tmp_path)
        # Write one object on each of two branches, remembering the ids.
        live_oids: list[str] = []
        for branch, content in (("main", b"main obj"), ("dev", b"dev obj")):
            oid = _write_blob(repo, content)
            snap_id = _write_snap(repo, {f"{branch}.py": oid})
            _write_commit_on_branch(repo, snap_id, branch=branch)
            live_oids.append(oid)

        maintenance_run_gc(repo, dry_run=False)

        # Both objects must survive.
        for oid in live_oids:
            assert object_path(repo, oid).exists(), (
                f"Maintenance gc deleted live object {oid}"
            )
| 633 | |
| 634 | |
| 635 | # --------------------------------------------------------------------------- |
| 636 | # W-3 Commit workflow — objects written before commit record |
| 637 | # --------------------------------------------------------------------------- |
| 638 | |
| 639 | |
class TestCommitWritePath:
    """W-3: blobs for a commit are in the object store at the canonical path.

    Exercised at the store level rather than through the CLI, which needs
    a full working-tree environment.
    """

    def test_snapshot_manifest_objects_at_canonical_path(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Every blob in a committed manifest exists under ``sha256/``."""
        repo = _repo(tmp_path)
        files = {}
        for index in range(3):
            files[f"src/file{index}.py"] = f"content {index}".encode()

        manifest = {}
        for rel_path, payload in files.items():
            oid = blob_id(payload)
            write_object(repo, oid, payload)
            manifest[rel_path] = oid

        snapshot = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snapshot)

        # Each manifest object must exist at the sharded sha256 location.
        for oid in manifest.values():
            stored = object_path(repo, oid)
            assert stored.exists()
            assert stored.parent.parent.name == "sha256"

    def test_all_manifest_objects_survive_full_gc(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Ten committed blobs all remain after a full GC with no grace period."""
        repo = _repo(tmp_path)
        manifest = {}
        for index in range(10):
            payload = f"committed file {index}".encode()
            oid = blob_id(payload)
            write_object(repo, oid, payload)
            manifest[f"file{index}.py"] = oid

        snapshot = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snapshot)

        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        for oid in manifest.values():
            assert object_path(repo, oid).exists()
| 684 | |
| 685 | |
| 686 | # --------------------------------------------------------------------------- |
| 687 | # W-4 Shelf save — blobs written before shelf entry |
| 688 | # --------------------------------------------------------------------------- |
| 689 | |
| 690 | |
class TestShelfWritePath:
    """W-4: blobs referenced only by a shelf entry must survive GC."""

    def test_shelved_objects_survive_non_full_gc(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Default GC collects nothing when the only blob is shelved."""
        repo = _repo(tmp_path)
        shelved = _write_blob(repo, b"shelved work")
        _write_shelf_entry(repo, {"work.py": shelved})
        outcome = run_gc(repo, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, shelved).exists()

    def test_shelved_objects_survive_full_gc(self, tmp_path: pathlib.Path) -> None:
        """Full GC collects nothing when the only blob is shelved."""
        repo = _repo(tmp_path)
        shelved = _write_blob(repo, b"shelved full gc")
        _write_shelf_entry(repo, {"wip.py": shelved})
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, shelved).exists()
File History
1 commit
sha256:248464b6a2f758985cbef90f864fa62c61842be699d975d6e00b6a9509ef919c
fix(delta): detect blob-identical file renames for files wi…
Sonnet 4.6
patch
24 days ago