test_object_store_write_taxonomy.py
file-level
1
files
1
commits
0
hotspots
0
🧟 dead
0
💥 blast risk
"""Object store write taxonomy — exhaustive correctness and safety tests.

Every path that writes OR deletes objects is enumerated here. Each test
targets one invariant. If a test fails, it means a write or delete path is
broken; fix the production code, not the test.

Write paths covered
-------------------
W-1  write_object()             — primary low-level write
W-2  write_object_from_path()   — write from filesystem file
W-3  commit workflow            — muse commit writes blobs then snapshot
W-4  shelf save                 — blobs written before shelf entry
W-5  fetch / pull _on_object    — objects written on receive
W-6  apply_mpack                — mpack unbundle writes objects
W-7  domain merge               — plugin merge writes merged blob
W-8  hash_object --write        — explicit low-level write

Delete paths covered
--------------------
D-1  gc non-full (default)         — orphan sweep via snapshots walker
D-2  gc full                       — tight reachability from live refs
D-3  gc full multi-branch          — objects on ALL branches survive
D-4  gc full object normalisation  — sha256: prefixed IDs in reachable set
D-5  prune                         — mirrors gc non-full with expire window
D-6  maintenance gc task           — calls run_gc with full=True

Consistency invariants
----------------------
C-1  write → has_object True
C-2  write → object_state PRESENT
C-3  write → iter_stored_objects finds it
C-4  has_object and object_state agree
C-5  object_path canonical location
C-6  no write → object_state MISSING (no promisors)
C-7  no write → object_state PROMISED (promisors configured)
"""
| 37 | |
| 38 | from __future__ import annotations |
| 39 | |
| 40 | import datetime |
| 41 | import json |
| 42 | import pathlib |
| 43 | import tempfile |
| 44 | from collections.abc import Mapping |
| 45 | from typing import TypedDict |
| 46 | |
| 47 | import pytest |
| 48 | |
| 49 | from muse.core.types import Manifest, blob_id, long_id, split_id |
| 50 | from muse.core.gc import run_gc, _collect_reachable_snapshots, _collect_reachable_commits |
| 51 | from muse.core.object_availability import ObjectState, load_promisor_remotes, object_state |
| 52 | from muse.core.object_store import ( |
| 53 | has_object, |
| 54 | iter_stored_objects, |
| 55 | object_path, |
| 56 | read_object, |
| 57 | write_object, |
| 58 | write_object_from_path, |
| 59 | ) |
| 60 | from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id |
| 61 | from muse.core.commits import ( |
| 62 | CommitRecord, |
| 63 | write_commit, |
| 64 | ) |
| 65 | from muse.core.snapshots import ( |
| 66 | SnapshotRecord, |
| 67 | write_snapshot, |
| 68 | ) |
| 69 | from muse.core.shelf import write_shelf_entry |
| 70 | from muse.core.paths import muse_dir, objects_dir, ref_path, shelf_dir |
| 71 | |
| 72 | |
| 73 | # --------------------------------------------------------------------------- |
| 74 | # Shared helpers |
| 75 | # --------------------------------------------------------------------------- |
| 76 | |
| 77 | |
def _repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Build a bare-bones ``.muse`` skeleton for *tmp_path* and return *tmp_path*.

    Inside the directory returned by ``muse_dir(tmp_path)`` this creates the
    object, commit, snapshot and branch-ref directories, a ``repo.json`` with
    a fixed repo id, and a ``HEAD`` that points at ``refs/heads/main``.
    No commits, snapshots or branch refs are written.
    """
    root = muse_dir(tmp_path)
    skeleton = (
        "objects/sha256",
        "commits/sha256",
        "snapshots/sha256",
        "refs/heads",
    )
    for rel in skeleton:
        root.joinpath(rel).mkdir(parents=True, exist_ok=True)

    repo_meta = json.dumps({"repo_id": "test-repo"})
    root.joinpath("repo.json").write_text(repo_meta)
    root.joinpath("HEAD").write_text("ref: refs/heads/main\n")
    return tmp_path
| 86 | |
| 87 | |
def _write_blob(repo: pathlib.Path, content: bytes) -> str:
    """Write *content* into the object store of *repo* and return its object ID.

    The ID is whatever ``blob_id(content)`` yields; it is passed to
    ``write_object`` together with the bytes.
    """
    object_id = blob_id(content)
    write_object(repo, object_id, content)
    return object_id
| 92 | |
| 93 | |
class _ShelfEntryRequired(TypedDict):
    """Keys a shelf entry carries before it has been assigned an ID."""

    snapshot: dict[str, str]
    branch: str
    created_at: str


class _ShelfEntryData(_ShelfEntryRequired, total=False):
    """Shelf entry payload handed to ``write_shelf_entry``.

    ``id`` is optional because it is derived from the serialised form of the
    other keys and can therefore only be added after they have been hashed.
    """

    id: str


def _write_shelf_entry(repo: pathlib.Path, snapshot: Mapping[str, str]) -> None:
    """Write a shelf entry for *snapshot* on branch ``main`` into *repo*.

    Args:
        repo: Repository root passed straight to ``write_shelf_entry``.
        snapshot: Path-to-object-ID mapping; copied into a plain ``dict``.

    The entry ID is ``sha256:<hex>``, where ``<hex>`` is the second element of
    ``split_id(blob_id(...))`` over the sorted-key JSON encoding of the entry
    *without* its ``id`` key.
    """
    entry: _ShelfEntryData = {
        "snapshot": dict(snapshot),
        "branch": "main",
        "created_at": "2026-01-01T00:00:00+00:00",
    }
    # Hash before adding "id" so the ID never depends on itself.
    raw_bytes = json.dumps(entry, sort_keys=True).encode()
    _, hex_id = split_id(blob_id(raw_bytes))
    entry["id"] = f"sha256:{hex_id}"
    write_shelf_entry(repo, entry)
| 111 | |
| 112 | |
def _write_snap(repo: pathlib.Path, manifest: Manifest) -> str:
    """Persist a snapshot record for *manifest* in *repo* and return its ID.

    The ID comes from ``compute_snapshot_id(manifest)``.
    """
    snapshot_id = compute_snapshot_id(manifest)
    record = SnapshotRecord(snapshot_id=snapshot_id, manifest=manifest)
    write_snapshot(repo, record)
    return snapshot_id
| 117 | |
| 118 | |
def _write_commit_on_branch(
    repo: pathlib.Path,
    snap_id: str,
    branch: str = "main",
    parent_id: str | None = None,
    message: str = "test",
) -> str:
    """Write a commit for *snap_id* and point *branch* at it.

    Args:
        repo: Repository root.
        snap_id: Snapshot the commit records.
        branch: Branch whose ref file is overwritten with the new commit ID.
        parent_id: Optional single parent; a falsy value means a root commit.
        message: Commit message.

    Returns:
        The commit ID computed by ``compute_commit_id``.

    The timestamp is fixed at 2026-01-01 UTC. The values fed to the hash are
    the parent list, snapshot ID, message and that timestamp; *branch* is not
    among them, so the same arguments on two branches give the same ID.
    """
    timestamp = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)

    parents: list[str] = []
    if parent_id:
        parents.append(parent_id)

    new_commit_id = compute_commit_id(
        parent_ids=parents,
        snapshot_id=snap_id,
        message=message,
        committed_at_iso=timestamp.isoformat(),
    )
    record = CommitRecord(
        commit_id=new_commit_id,
        branch=branch,
        snapshot_id=snap_id,
        message=message,
        committed_at=timestamp,
        parent_commit_id=parent_id,
    )
    write_commit(repo, record)

    # Branch names may contain "/", so make sure the ref's directory exists.
    branch_ref = ref_path(repo, branch)
    branch_ref.parent.mkdir(parents=True, exist_ok=True)
    branch_ref.write_text(new_commit_id)
    return new_commit_id
| 149 | |
| 150 | |
# ---------------------------------------------------------------------------
# W-1 write_object — canonical path
# ---------------------------------------------------------------------------
| 154 | |
| 155 | |
class TestWriteObject:
    """W-1: ``write_object()`` stores blobs at the canonical ``sha256/`` path."""

    def test_lands_under_sha256_dir(self, tmp_path: pathlib.Path) -> None:
        payload = b"hello"
        repo = _repo(tmp_path)
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        stored_at = object_path(repo, oid)
        assert stored_at.exists()
        # Layout is .../sha256/<shard>/<file>, so the grandparent is "sha256".
        assert stored_at.parent.parent.name == "sha256"

    def test_shard_prefix_is_first_two_hex_chars(self, tmp_path: pathlib.Path) -> None:
        payload = b"shard-check"
        repo = _repo(tmp_path)
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        _, digest = split_id(oid)
        shard_dir = object_path(repo, oid).parent
        assert shard_dir.name == digest[:2]

    def test_filename_is_remaining_62_hex_chars(self, tmp_path: pathlib.Path) -> None:
        payload = b"filename-check"
        repo = _repo(tmp_path)
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        _, digest = split_id(oid)
        assert object_path(repo, oid).name == digest[2:]

    def test_idempotent_returns_false_on_second_write(
        self, tmp_path: pathlib.Path
    ) -> None:
        payload = b"idempotent"
        repo = _repo(tmp_path)
        oid = blob_id(payload)
        # First write creates the object; the repeat reports "nothing written".
        assert write_object(repo, oid, payload) is True
        assert write_object(repo, oid, payload) is False

    def test_content_verifiable_after_write(self, tmp_path: pathlib.Path) -> None:
        payload = b"verifiable content"
        repo = _repo(tmp_path)
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        round_tripped = read_object(repo, oid)
        assert round_tripped == payload

    def test_rejects_wrong_content(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        mismatched_oid = blob_id(b"correct")
        with pytest.raises(ValueError):
            write_object(repo, mismatched_oid, b"wrong content")

    def test_rejects_bare_hex_object_id(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        _, unprefixed = split_id(blob_id(b"bare"))
        # NOTE(review): Exception already covers ValueError, so any exception
        # passes this check - narrow once the exact type is confirmed.
        with pytest.raises((ValueError, Exception)):
            write_object(repo, unprefixed, b"bare")
| 211 | |
| 212 | |
# ---------------------------------------------------------------------------
# W-2 write_object_from_path — canonical path
# ---------------------------------------------------------------------------
| 216 | |
| 217 | |
class TestWriteObjectFromPath:
    """W-2: write_object_from_path() writes from a file and lands at canonical path."""

    def test_writes_to_sha256_dir(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        src = tmp_path / "source.txt"
        content = b"from-path content"
        src.write_bytes(content)
        oid = blob_id(content)
        write_object_from_path(repo, oid, src)
        p = object_path(repo, oid)
        assert p.exists()
        assert p.parent.parent.name == "sha256"

    def test_oid_matches_blob_id(self, tmp_path: pathlib.Path) -> None:
        """The bytes stored under *oid* must hash back to *oid*."""
        repo = _repo(tmp_path)
        content = b"oid must match blob_id"
        src = tmp_path / "f.txt"
        src.write_bytes(content)
        oid = blob_id(content)
        write_object_from_path(repo, oid, src)
        # Comparing oid with blob_id(content) would be a tautology, because
        # oid was computed from that very call. Re-hash what was stored instead.
        assert has_object(repo, oid)
        assert blob_id(read_object(repo, oid)) == oid

    def test_content_readable_after_write(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        content = b"readable after write"
        src = tmp_path / "r.txt"
        src.write_bytes(content)
        oid = blob_id(content)
        write_object_from_path(repo, oid, src)
        assert read_object(repo, oid) == content
| 249 | |
| 250 | |
# ---------------------------------------------------------------------------
# C-1 … C-7 Consistency invariants
# ---------------------------------------------------------------------------
| 254 | |
| 255 | |
class TestConsistencyInvariants:
    """C-1 to C-7: write, ``has_object``, ``object_state`` and
    ``iter_stored_objects`` must agree about whether an object exists."""

    def test_c1_has_object_true_after_write(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"c1")
        assert has_object(repo, written)

    def test_c2_object_state_present_after_write(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"c2")
        assert object_state(repo, written, []) == ObjectState.PRESENT

    def test_c3_iter_stored_objects_finds_written(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"c3")
        stored_ids = set()
        for stored_id, _ in iter_stored_objects(repo):
            stored_ids.add(stored_id)
        assert written in stored_ids

    def test_c4_has_object_and_object_state_agree_present(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"c4-present")
        assert has_object(repo, written)
        state = object_state(repo, written, [])
        assert state == ObjectState.PRESENT

    def test_c4_has_object_and_object_state_agree_absent(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        # ID of content that is deliberately never stored.
        unwritten = blob_id(b"never written")
        assert not has_object(repo, unwritten)
        state = object_state(repo, unwritten, [])
        assert state == ObjectState.MISSING

    def test_c5_object_path_canonical_location(self, tmp_path: pathlib.Path) -> None:
        payload = b"canonical"
        repo = _repo(tmp_path)
        oid = blob_id(payload)
        write_object(repo, oid, payload)
        _, digest = split_id(oid)
        shard, remainder = digest[:2], digest[2:]
        expected = objects_dir(repo).joinpath("sha256", shard, remainder)
        actual = object_path(repo, oid)
        assert actual == expected
        assert actual.exists()

    def test_c6_object_state_missing_when_absent_no_promisors(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        unwritten = blob_id(b"missing")
        assert object_state(repo, unwritten, promisor_remotes=[]) == ObjectState.MISSING

    def test_c7_object_state_promised_when_absent_with_promisor(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        unwritten = blob_id(b"promised")
        promisors = ["staging"]
        assert object_state(repo, unwritten, promisor_remotes=promisors) == ObjectState.PROMISED

    def test_c7_object_state_present_beats_promisor(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Local presence wins: a stored object is PRESENT even with promisors."""
        repo = _repo(tmp_path)
        written = _write_blob(repo, b"present beats promisor")
        promisors = ["staging"]
        assert object_state(repo, written, promisor_remotes=promisors) == ObjectState.PRESENT
| 329 | |
| 330 | |
# ---------------------------------------------------------------------------
# D-1 GC non-full — orphan sweep
# ---------------------------------------------------------------------------
| 334 | |
| 335 | |
class TestGcNonFull:
    """D-1: the default (non-full) GC removes orphans and keeps reachable objects."""

    def test_orphan_collected(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        orphan = _write_blob(repo, b"orphan")
        run_gc(repo, grace_period_seconds=0)
        assert not object_path(repo, orphan).exists()

    def test_reachable_via_snapshot_survives(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        live = _write_blob(repo, b"reachable")
        snapshot = _write_snap(repo, {"f.txt": live})
        _write_commit_on_branch(repo, snapshot)
        run_gc(repo, grace_period_seconds=0)
        assert object_path(repo, live).exists()

    def test_reachable_on_non_default_branch_survives(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        live = _write_blob(repo, b"non-default branch")
        snapshot = _write_snap(repo, {"g.txt": live})
        # Only "dev" gets a ref; HEAD still names "main", which has none.
        _write_commit_on_branch(repo, snapshot, branch="dev")
        run_gc(repo, grace_period_seconds=0)
        assert object_path(repo, live).exists()

    def test_multiple_orphans_all_collected(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        orphans = []
        for n in range(5):
            orphans.append(_write_blob(repo, f"o{n}".encode()))
        outcome = run_gc(repo, grace_period_seconds=0)
        assert outcome.collected_count == 5
        for orphan in orphans:
            assert not object_path(repo, orphan).exists()

    def test_grace_period_protects_recent_objects(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        fresh = _write_blob(repo, b"fresh orphan")
        # The blob was written moments ago, well inside a one-hour grace window.
        outcome = run_gc(repo, grace_period_seconds=3600)
        assert outcome.collected_count == 0
        assert object_path(repo, fresh).exists()
| 379 | |
| 380 | |
# ---------------------------------------------------------------------------
# D-2 GC full — tight reachability
# ---------------------------------------------------------------------------
| 384 | |
| 385 | |
class TestGcFull:
    """D-2: full GC applies tight reachability yet must keep every live object."""

    def test_reachable_object_survives_full_gc(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        live = _write_blob(repo, b"live object")
        snapshot = _write_snap(repo, {"live.txt": live})
        _write_commit_on_branch(repo, snapshot)
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, live).exists()

    def test_orphan_collected_by_full_gc(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        # A committed blob ...
        live = _write_blob(repo, b"live")
        snapshot = _write_snap(repo, {"f.txt": live})
        _write_commit_on_branch(repo, snapshot)
        # ... and one that nothing refers to.
        orphan = _write_blob(repo, b"orphan")
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 1
        assert not object_path(repo, orphan).exists()
        assert object_path(repo, live).exists()

    def test_full_gc_dry_run_does_not_delete(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        orphan = _write_blob(repo, b"dry-run orphan")
        outcome = run_gc(repo, full=True, dry_run=True, grace_period_seconds=0)
        assert outcome.dry_run is True
        # The orphan would be collectable, but a dry run must leave it on disk.
        assert object_path(repo, orphan).exists()
| 416 | |
| 417 | |
# ---------------------------------------------------------------------------
# D-3 GC full multi-branch — objects on ALL live branches survive
# ---------------------------------------------------------------------------
| 421 | |
| 422 | |
class TestGcFullMultiBranch:
    """D-3: full GC keeps objects reachable from any live branch, not only HEAD."""

    def test_object_on_secondary_branch_survives_full_gc(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        # Blob committed on main.
        main_oid = _write_blob(repo, b"main content")
        _write_commit_on_branch(
            repo, _write_snap(repo, {"main.txt": main_oid}), branch="main"
        )
        # Blob with different content committed on dev.
        dev_oid = _write_blob(repo, b"dev content")
        _write_commit_on_branch(
            repo, _write_snap(repo, {"dev.txt": dev_oid}), branch="dev"
        )
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, main_oid).exists(), "main branch object deleted!"
        assert object_path(repo, dev_oid).exists(), "dev branch object deleted!"

    def test_object_on_three_branches_all_survive(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        per_branch = []
        for name in ("main", "dev", "feat/x"):
            blob = _write_blob(repo, f"content on {name}".encode())
            snapshot = _write_snap(repo, {f"{name}.txt": blob})
            _write_commit_on_branch(repo, snapshot, branch=name)
            per_branch.append(blob)
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        for blob in per_branch:
            assert object_path(repo, blob).exists()

    def test_shared_object_referenced_by_two_branches_survives(
        self, tmp_path: pathlib.Path
    ) -> None:
        """One blob referenced from both main and dev must survive full GC."""
        repo = _repo(tmp_path)
        shared = _write_blob(repo, b"shared content")
        for name in ("main", "dev"):
            # Same manifest on each pass, so the snapshot is written twice.
            snapshot = _write_snap(repo, {"shared.txt": shared})
            _write_commit_on_branch(repo, snapshot, branch=name)
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, shared).exists()
| 470 | |
| 471 | |
| 472 | # --------------------------------------------------------------------------- |
| 473 | # D-4 GC full object ID normalisation |
| 474 | # --------------------------------------------------------------------------- |
| 475 | |
| 476 | |
class TestGcFullObjectNormalisation:
    """D-4: full GC reachability set uses sha256:-prefixed IDs throughout.

    This is the critical invariant that ensures the reachable-objects set
    (built from snapshot manifests) matches the stored-objects set
    (built from iter_stored_objects). A mismatch would cause live objects
    to be incorrectly classified as unreachable and deleted.
    """

    def test_reachable_set_uses_prefixed_ids(self, tmp_path: pathlib.Path) -> None:
        """_collect_reachable_snapshots returns sha256:-prefixed object IDs."""
        repo = _repo(tmp_path)
        oid = _write_blob(repo, b"normalisation check")
        snap_id = _write_snap(repo, {"f.txt": oid})
        _write_commit_on_branch(repo, snap_id)
        reachable_commits = _collect_reachable_commits(repo)
        _, reachable_objs = _collect_reachable_snapshots(repo, reachable_commits)
        # Every entry must carry the sha256: prefix.
        for obj_id in reachable_objs:
            assert obj_id.startswith("sha256:"), (
                f"Reachable object ID missing sha256: prefix: {obj_id!r}"
            )

    def test_iter_stored_objects_uses_prefixed_ids(
        self, tmp_path: pathlib.Path
    ) -> None:
        """iter_stored_objects returns sha256:-prefixed object IDs."""
        repo = _repo(tmp_path)
        _write_blob(repo, b"stored check")
        for oid, _ in iter_stored_objects(repo):
            assert oid.startswith("sha256:"), (
                f"iter_stored_objects returned unprefixed ID: {oid!r}"
            )

    def test_reachable_set_matches_stored_set_for_live_objects(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Every live object must appear in both sets with the same ID form."""
        repo = _repo(tmp_path)
        # A list, not a set: enumerating a set of strings follows hash order,
        # which would make the manifest (and snapshot ID) differ between runs.
        oids = [_write_blob(repo, f"live {n}".encode()) for n in range(3)]
        manifest = {f"f{idx}.txt": oid for idx, oid in enumerate(oids)}
        snap_id = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snap_id)
        reachable_commits = _collect_reachable_commits(repo)
        _, reachable_objs = _collect_reachable_snapshots(repo, reachable_commits)
        stored = {o for o, _ in iter_stored_objects(repo)}
        # All live objects must be in both sets.
        for oid in oids:
            assert oid in reachable_objs, f"{oid} missing from reachable set"
            assert oid in stored, f"{oid} missing from stored set"

    def test_full_gc_does_not_delete_prefixed_manifest_objects(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Regression: full GC must not delete objects whose IDs use sha256: prefix in the manifest."""
        repo = _repo(tmp_path)
        contents = [f"file {i} content".encode() for i in range(5)]
        manifest = {}
        for i, c in enumerate(contents):
            oid = _write_blob(repo, c)
            manifest[f"file{i}.py"] = oid
            # Confirm the manifest value is prefixed.
            assert oid.startswith("sha256:"), f"blob_id returned unprefixed: {oid}"
        snap_id = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snap_id)
        result = run_gc(repo, full=True, grace_period_seconds=0)
        assert result.collected_count == 0, (
            f"Full GC deleted {result.collected_count} live objects: {result.collected_ids}"
        )
        for oid in manifest.values():
            assert object_path(repo, oid).exists(), f"Full GC deleted live object {oid}"

    def test_full_gc_retains_large_manifest(self, tmp_path: pathlib.Path) -> None:
        """Full GC must not delete any of N live objects in a large snapshot."""
        repo = _repo(tmp_path)
        n = 50
        manifest = {}
        for i in range(n):
            oid = _write_blob(repo, f"large manifest entry {i}".encode())
            manifest[f"src/file_{i:03d}.py"] = oid
        snap_id = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snap_id)
        result = run_gc(repo, full=True, grace_period_seconds=0)
        assert result.collected_count == 0, (
            f"Full GC deleted objects from large manifest: {result.collected_ids[:5]}"
        )
| 565 | |
| 566 | |
# ---------------------------------------------------------------------------
# D-5 Prune — mirrors non-full GC with expire window
# ---------------------------------------------------------------------------
| 570 | |
| 571 | |
class TestPruneSafety:
    """D-5: ``muse prune`` must leave every reachable object in place."""

    def test_prune_does_not_remove_committed_object(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A blob referenced by a commit is still on disk after the prune sweep."""
        # Prune delegates to gc, so the GC entry point is exercised directly.
        from muse.core.gc import run_gc

        repo = _repo(tmp_path)
        committed = _write_blob(repo, b"committed object")
        snapshot = _write_snap(repo, {"f.txt": committed})
        _write_commit_on_branch(repo, snapshot)
        # Prune uses the non-full mode, hence no ``full=True`` here.
        outcome = run_gc(repo, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, committed).exists()
| 588 | |
| 589 | |
| 590 | # --------------------------------------------------------------------------- |
| 591 | # D-6 Maintenance gc task passes full=True |
| 592 | # --------------------------------------------------------------------------- |
| 593 | |
| 594 | |
class TestMaintenanceGcUsesFull:
    """D-6: the maintenance 'gc' task must invoke run_gc with full=True."""

    def test_maintenance_gc_task_calls_run_gc_with_full_true(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Confirm _run_gc (the maintenance task) passes full=True to run_gc."""
        from muse.cli.commands import maintenance as maint_mod
        from muse.core.gc import GcResult

        calls: list[dict[str, bool]] = []

        def _capture_run_gc(
            root: pathlib.Path,
            *,
            dry_run: bool,
            grace_period_seconds: float,
            full: bool = False,
        ) -> GcResult:
            # ``full`` defaults to False so that a caller which omits it is
            # recorded and fails the assertion below with its explicit
            # message, instead of dying here with an unrelated TypeError.
            calls.append({"full": full, "dry_run": dry_run})
            return GcResult(
                dry_run=dry_run,
                grace_period_seconds=grace_period_seconds,
                full=full,
            )

        # Patch the name as looked up inside the maintenance module.
        monkeypatch.setattr(maint_mod, "run_gc", _capture_run_gc)
        repo = _repo(tmp_path)
        maint_mod._run_gc(repo)
        assert calls, "run_gc was never called by maintenance _run_gc"
        assert calls[0]["full"] is True, (
            f"Maintenance gc must pass full=True, got full={calls[0]['full']}"
        )

    def test_maintenance_gc_retains_all_reachable_objects(
        self, tmp_path: pathlib.Path
    ) -> None:
        """End-to-end: running the maintenance gc task must not delete live objects."""
        from muse.cli.commands.maintenance import _run_gc as maintenance_run_gc

        repo = _repo(tmp_path)
        # Write objects on two branches.
        for branch, content in (("main", b"main obj"), ("dev", b"dev obj")):
            oid = _write_blob(repo, content)
            snap_id = _write_snap(repo, {f"{branch}.py": oid})
            _write_commit_on_branch(repo, snap_id, branch=branch)

        maintenance_run_gc(repo, dry_run=False)

        # Both objects must survive.
        for content in (b"main obj", b"dev obj"):
            oid = blob_id(content)
            assert object_path(repo, oid).exists(), (
                f"Maintenance gc deleted live object {oid}"
            )
| 640 | |
| 641 | |
# ---------------------------------------------------------------------------
# W-3 Commit workflow — objects written before commit record
# ---------------------------------------------------------------------------
| 645 | |
| 646 | |
class TestCommitWritePath:
    """W-3: blobs for a commit are in the object store, at the canonical
    path, before the commit record is created.

    Exercised at the store level rather than through the CLI, because the CLI
    needs a full working-tree environment.
    """

    def test_snapshot_manifest_objects_at_canonical_path(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Every blob written for a commit sits under the ``sha256/`` directory."""
        repo = _repo(tmp_path)
        manifest = {}
        for n in range(3):
            payload = f"content {n}".encode()
            oid = blob_id(payload)
            write_object(repo, oid, payload)
            manifest[f"src/file{n}.py"] = oid
        snapshot = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snapshot)
        # Each manifest entry must exist two levels below "sha256".
        for oid in manifest.values():
            location = object_path(repo, oid)
            assert location.exists()
            assert location.parent.parent.name == "sha256"

    def test_all_manifest_objects_survive_full_gc(
        self, tmp_path: pathlib.Path
    ) -> None:
        """No blob of a committed snapshot may be collected by full GC."""
        repo = _repo(tmp_path)
        manifest = {}
        for n in range(10):
            payload = f"committed file {n}".encode()
            oid = blob_id(payload)
            write_object(repo, oid, payload)
            manifest[f"file{n}.py"] = oid
        snapshot = _write_snap(repo, manifest)
        _write_commit_on_branch(repo, snapshot)
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        for oid in manifest.values():
            assert object_path(repo, oid).exists()
| 691 | |
| 692 | |
# ---------------------------------------------------------------------------
# W-4 Shelf save — blobs written before shelf entry
# ---------------------------------------------------------------------------
| 696 | |
| 697 | |
class TestShelfWritePath:
    """W-4: blobs referenced only by a shelf entry must survive GC."""

    def test_shelved_objects_survive_non_full_gc(
        self, tmp_path: pathlib.Path
    ) -> None:
        repo = _repo(tmp_path)
        shelved = _write_blob(repo, b"shelved work")
        # No commit is written: the shelf entry is the only reference.
        _write_shelf_entry(repo, {"work.py": shelved})
        outcome = run_gc(repo, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, shelved).exists()

    def test_shelved_objects_survive_full_gc(self, tmp_path: pathlib.Path) -> None:
        repo = _repo(tmp_path)
        shelved = _write_blob(repo, b"shelved full gc")
        # No commit is written: the shelf entry is the only reference.
        _write_shelf_entry(repo, {"wip.py": shelved})
        outcome = run_gc(repo, full=True, grace_period_seconds=0)
        assert outcome.collected_count == 0
        assert object_path(repo, shelved).exists()