test_cmd_gc_hardening.py
python
sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b
fix: try fetch/presign before fetch/mpack to avoid Cloudfla…
Sonnet 4.6
patch
7 days ago
| 1 | """Comprehensive hardening tests for ``muse gc``. |
| 2 | |
| 3 | Coverage dimensions: |
| 4 | |
| 5 | Unit |
| 6 | ~~~~ |
| 7 | - ``_is_hex`` edge cases (empty string, uppercase, mixed, valid) |
| 8 | - ``_list_stored_objects`` symlink guard for prefix dirs |
| 9 | - ``_list_stored_objects`` symlink guard for object files |
| 10 | - ``_list_stored_objects`` grace period filters recent files |
| 11 | - ``_list_stored_objects`` grace_period=0 includes all files |
| 12 | - ``_collect_reachable_objects`` symlink guard on shelf.json |
| 13 | - ``_collect_reachable_objects`` size cap on shelf.json |
| 14 | - ``_collect_reachable_objects`` malformed shelf.json is skipped gracefully |
| 15 | - ``run_gc`` grace_period_seconds stored in GcResult |
| 16 | - ``_fmt_bytes`` all size ranges |
| 17 | - ``run_gc`` negative grace period rejected by CLI |
| 18 | |
| 19 | Security |
| 20 | ~~~~~~~~ |
| 21 | - Symlink in .muse/objects/ prefix dir not deleted or followed |
| 22 | - Symlink object file not deleted or followed |
| 23 | - Symlink shelf.json skipped during reachability walk |
| 24 | - ANSI escape sequences in object IDs sanitized in text output |
| 25 | - Invalid --format rejected with error to stderr |
| 26 | - Negative --grace-period rejected with non-zero exit |
| 27 | |
| 28 | Integration (CLI) |
| 29 | ~~~~~~~~~~~~~~~~~ |
| 30 | - ``--json`` output schema matches ``_GcJson`` TypedDict |
| 31 | - ``--json`` includes ``grace_period_seconds`` field |
| 32 | - ``--grace-period`` value propagated to GcResult |
| 33 | - ``--dry-run`` combined with ``--json`` reports correctly |
| 34 | - ``--verbose`` combined with ``--json`` shows IDs in JSON |
| 35 | - ``--format text`` is the default |
| 36 | - Repeated GC runs are idempotent (JSON) |
| 37 | |
| 38 | E2E |
| 39 | ~~~ |
| 40 | - Full lifecycle: orphan accumulates across branches, GC reclaims |
| 41 | - GC after shelf save does NOT delete shelved objects |
| 42 | - GC with corrupt shelf.json succeeds (skips shelf walk) |
| 43 | - ``--grace-period 0`` collects freshly-written orphan |
| 44 | - ``--grace-period 9999`` protects freshly-written orphan |
| 45 | |
| 46 | Stress |
| 47 | ~~~~~~ |
| 48 | - 500 orphaned objects collected correctly; 100 orphans spread across multiple prefix dirs collected correctly |
| 49 | - Concurrent read-only GC (dry-run) on same repo is safe |
| 50 | """ |
| 51 | |
| 52 | from __future__ import annotations |
| 53 | |
| 54 | import json |
| 55 | import os |
| 56 | import pathlib |
| 57 | import stat |
| 58 | import threading |
| 59 | import time |
| 60 | from collections.abc import Mapping |
| 61 | from typing import TypedDict |
| 62 | |
| 63 | |
| 64 | import pytest |
| 65 | from tests.cli_test_helper import CliRunner, InvokeResult |
| 66 | from muse.core.types import fake_id, long_id |
| 67 | from muse.core.object_store import object_path |
| 68 | from muse.core.paths import heads_dir, merge_state_path, muse_dir, objects_dir, shelf_dir |
| 69 | |
# Placeholder passed as the first argument of every ``runner.invoke(cli, ...)``
# call in this module.
cli = None # argparse bridge — CliRunner ignores this
# Single runner instance shared by all CLI-level tests below.
runner = CliRunner()
| 72 | |
| 73 | |
| 74 | # --------------------------------------------------------------------------- |
| 75 | # Helpers |
| 76 | # --------------------------------------------------------------------------- |
| 77 | |
| 78 | |
def _env(root: pathlib.Path) -> dict[str, str]:
    """Return the environment overrides that point the CLI at *root*.

    The mapping is passed as ``env=`` to ``runner.invoke``.  It holds a
    single key, ``MUSE_REPO_ROOT``, set to the string form of *root*.
    """
    return {"MUSE_REPO_ROOT": str(root)}
| 81 | |
| 82 | |
def _make_repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Create a minimal repository skeleton for *tmp_path* and return *tmp_path*.

    Inside the directory returned by ``muse_dir(tmp_path)`` this creates the
    ``objects``, ``commits``, ``snapshots`` and ``refs/heads`` directories,
    a ``repo.json`` descriptor, and a ``HEAD`` file pointing at
    ``refs/heads/main``.
    """
    dot_muse = muse_dir(tmp_path)
    for name in ("objects", "commits", "snapshots", "refs/heads"):
        dot_muse.joinpath(name).mkdir(parents=True, exist_ok=True)

    descriptor = {
        "repo_id": fake_id("repo"),
        "domain": "code",
        "default_branch": "main",
        "created_at": "2026-01-01T00:00:00+00:00",
    }
    dot_muse.joinpath("repo.json").write_text(json.dumps(descriptor), encoding="utf-8")
    dot_muse.joinpath("HEAD").write_text("ref: refs/heads/main\n", encoding="utf-8")
    return tmp_path
| 96 | |
| 97 | |
def _write_object(root: pathlib.Path, content: bytes) -> str:
    """Write *content* to the object store of *root* and return its object ID.

    The ID is whatever ``blob_id(content)`` yields; the same ID is used as
    the key handed to ``write_object``.
    """
    from muse.core.object_store import write_object
    from muse.core.types import blob_id

    object_id = blob_id(content)
    write_object(root, object_id, content)
    return object_id
| 104 | |
| 105 | |
class _ShelfEntryData(TypedDict):
    """Payload that ``_write_shelf_entry`` serialises to derive a shelf entry ID."""

    # Path -> object ID mapping (tests use e.g. ``{"a.py": obj_id}``).
    snapshot: dict[str, str]
    # Branch name stored with the entry; the helper always writes "main".
    branch: str
    # Timestamp string; the helper writes "2026-01-01T00:00:00+00:00".
    created_at: str
| 110 | |
| 111 | |
def _write_shelf_entry(root: pathlib.Path, snapshot: Mapping[str, str]) -> pathlib.Path:
    """Write a shelf entry for *snapshot* and return the path it is expected at.

    The entry ID is derived from the key-sorted JSON encoding of the payload
    *before* the ``id`` field is attached, so the ID does not depend on
    itself.  The on-disk format is decided by
    ``muse.core.shelf.write_shelf_entry``.

    Args:
        root: Repository root.
        snapshot: Path -> object ID mapping to record in the entry.

    Returns:
        ``shelf_dir(root) / "sha256" / <hex id>``.
    """
    from muse.core.shelf import write_shelf_entry
    from muse.core.types import blob_id, split_id

    entry_data: _ShelfEntryData = {
        "snapshot": dict(snapshot),
        "branch": "main",
        "created_at": "2026-01-01T00:00:00+00:00",
    }
    raw_bytes = json.dumps(entry_data, sort_keys=True).encode()
    _, hex_id = split_id(blob_id(raw_bytes))
    # ``id`` is not a declared ``_ShelfEntryData`` key, so attach it on a
    # fresh dict instead of mutating the typed payload.  Key order matches
    # the previous in-place assignment (``id`` last).
    write_shelf_entry(root, {**entry_data, "id": f"sha256:{hex_id}"})
    return shelf_dir(root) / "sha256" / hex_id
| 128 | |
| 129 | |
def _make_commit(root: pathlib.Path, manifest: dict[str, str] | None = None) -> str:
    """Write a snapshot plus a parentless commit on ``main`` and return the commit ID.

    Args:
        root: Repository root.
        manifest: Snapshot manifest, e.g. ``{"file.txt": <object id>}``.
            ``None`` (or an empty mapping) produces an empty manifest.

    Returns:
        The ID computed by ``hash_commit``; the same value is written to
        ``heads_dir(root) / "main"``.
    """
    import datetime

    from muse.core.commits import CommitRecord, write_commit
    from muse.core.ids import hash_commit, hash_snapshot
    from muse.core.snapshots import SnapshotRecord, write_snapshot

    mfst: dict[str, str] = manifest or {}
    snap_id = hash_snapshot(mfst)
    # Fixed timestamp keeps the commit ID stable across runs.
    committed_at = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
    commit_id = hash_commit(
        parent_ids=[],
        snapshot_id=snap_id,
        message="test",
        committed_at_iso=committed_at.isoformat(),
    )
    write_snapshot(root, SnapshotRecord(snapshot_id=snap_id, manifest=mfst))
    write_commit(root, CommitRecord(
        commit_id=commit_id,
        branch="main",
        snapshot_id=snap_id,
        message="test",
        committed_at=committed_at,
    ))
    # Point the ``main`` head at the new commit.
    ref_path = heads_dir(root) / "main"
    ref_path.parent.mkdir(parents=True, exist_ok=True)
    ref_path.write_text(commit_id, encoding="utf-8")
    return commit_id
| 162 | |
| 163 | |
def _invoke_gc(root: pathlib.Path, *extra_args: str) -> InvokeResult:
    """Run ``muse gc`` against *root*, defaulting to ``--grace-period 0``.

    The default is prepended only when the literal token ``--grace-period``
    is absent from *extra_args*.  Exceptions are not caught by the runner.
    """
    argv = ["gc"]
    if "--grace-period" not in extra_args:
        argv += ["--grace-period", "0"]
    argv.extend(extra_args)
    return runner.invoke(cli, argv, env=_env(root), catch_exceptions=False)
| 170 | |
| 171 | |
| 172 | # --------------------------------------------------------------------------- |
| 173 | # _GcJson TypedDict for test assertions |
| 174 | # --------------------------------------------------------------------------- |
| 175 | |
| 176 | |
class _GcJson(TypedDict):
    """Fields these tests read from the JSON document printed by ``muse gc --json``."""

    collected_count: int
    collected_bytes: int
    reachable_count: int
    duration_ms: float
    # Tests expect this to echo the ``--grace-period`` value passed on the CLI.
    grace_period_seconds: int
    # Tests expect ``True`` when ``--dry-run`` was passed.
    dry_run: bool
    # Tests expect this list to be in sorted order.
    collected_ids: list[str]
| 185 | |
| 186 | |
def _parse_gc_json(output: str) -> _GcJson:
    """Extract and parse the first JSON object found in CliRunner output.

    A candidate object starts on any line whose first non-blank character is
    ``{``.  Decoding runs from that brace over the remainder of *output*, so
    both single-line and indented (multi-line) documents are accepted.  A
    brace-leading line that does not start valid JSON is skipped rather than
    aborting the search.

    Raises:
        AssertionError: no line starts a decodable JSON object.
        KeyError: the decoded object lacks one of the ``_GcJson`` fields.
    """
    decoder = json.JSONDecoder()
    cursor = 0
    for line in output.splitlines(keepends=True):
        # Track absolute offsets so decoding can continue past the line end.
        line_start, cursor = cursor, cursor + len(line)
        if not line.lstrip().startswith("{"):
            continue
        try:
            raw, _ = decoder.raw_decode(output, line_start + line.index("{"))
        except json.JSONDecodeError:
            continue
        parsed: _GcJson = {
            "collected_count": int(raw["collected_count"]),
            "collected_bytes": int(raw["collected_bytes"]),
            "reachable_count": int(raw["reachable_count"]),
            "duration_ms": float(raw["duration_ms"]),
            "grace_period_seconds": int(raw["grace_period_seconds"]),
            "dry_run": bool(raw["dry_run"]),
            "collected_ids": [str(x) for x in raw["collected_ids"]],
        }
        return parsed
    raise AssertionError(f"No JSON object found in output:\n{output}")
| 203 | |
| 204 | |
| 205 | # --------------------------------------------------------------------------- |
| 206 | # Unit — _is_hex |
| 207 | # --------------------------------------------------------------------------- |
| 208 | |
| 209 | |
class TestIsHex:
    """Edge cases for ``muse.core.gc._is_hex``."""

    @staticmethod
    def _call(candidate: str) -> bool:
        """Import ``_is_hex`` at call time (as each test did) and apply it."""
        from muse.core.gc import _is_hex
        return _is_hex(candidate)

    def test_empty_string_is_not_hex(self) -> None:
        assert not self._call("")

    def test_valid_lowercase_hex(self) -> None:
        assert self._call("0123456789abcdef")

    def test_uppercase_rejected(self) -> None:
        assert not self._call("ABCDEF")

    def test_mixed_case_rejected(self) -> None:
        assert not self._call("0aF")

    def test_non_hex_chars_rejected(self) -> None:
        assert not self._call("xyz")

    def test_single_valid_char(self) -> None:
        assert self._call("a")

    def test_64_char_sha256(self) -> None:
        digest = "a" * 64
        assert self._call(digest)
| 239 | |
| 240 | |
| 241 | # --------------------------------------------------------------------------- |
| 242 | # Unit — _fmt_bytes |
| 243 | # --------------------------------------------------------------------------- |
| 244 | |
| 245 | |
class TestFmtBytes:
    """Unit boundaries of ``muse.cli.commands.gc._fmt_bytes`` (B, KiB, MiB)."""

    @staticmethod
    def _fmt(size: int) -> str:
        """Import ``_fmt_bytes`` at call time and format *size*."""
        from muse.cli.commands.gc import _fmt_bytes
        return _fmt_bytes(size)

    def test_bytes_range(self) -> None:
        assert self._fmt(0) == "0 B"
        assert self._fmt(1023) == "1023 B"

    def test_kib_range(self) -> None:
        one_mib = 1024 * 1024
        assert "KiB" in self._fmt(1024)
        assert "KiB" in self._fmt(one_mib - 1)

    def test_mib_range(self) -> None:
        one_mib = 1024 * 1024
        assert "MiB" in self._fmt(one_mib)
        assert "MiB" in self._fmt(one_mib * 100)
| 261 | |
| 262 | |
| 263 | # --------------------------------------------------------------------------- |
| 264 | # Unit — _list_stored_objects |
| 265 | # --------------------------------------------------------------------------- |
| 266 | |
| 267 | |
class TestListStoredObjects:
    """Symlink guards, grace-period filtering and name validation in ``_list_stored_objects``."""

    @staticmethod
    def _scan(root: pathlib.Path, grace: int = 0):
        """Return the raw pairs produced by ``_list_stored_objects`` for *root*."""
        from muse.core.gc import _list_stored_objects
        return _list_stored_objects(root, grace_period_seconds=grace)

    @classmethod
    def _scan_ids(cls, root: pathlib.Path) -> set[str]:
        """Return only the object IDs listed for *root* with no grace period."""
        return {oid for oid, _ in cls._scan(root)}

    def test_symlink_prefix_dir_is_skipped(self, tmp_path: pathlib.Path) -> None:
        """A symlinked prefix directory must not be entered."""
        root = _make_repo(tmp_path)
        sha = "a" * 64
        outside_dir = tmp_path / "external_objects"
        outside_dir.mkdir()
        (outside_dir / sha[2:]).write_bytes(b"content")

        # .muse/objects/sha256/<prefix> becomes a symlink to the external dir.
        algo_dir = objects_dir(root) / "sha256"
        algo_dir.mkdir(parents=True, exist_ok=True)
        (algo_dir / sha[:2]).symlink_to(outside_dir)

        assert sha not in self._scan_ids(root), "Symlinked prefix dir must not be entered"

    def test_symlink_object_file_is_skipped(self, tmp_path: pathlib.Path) -> None:
        """A symlinked object file must not be listed or ever unlinked."""
        root = _make_repo(tmp_path)
        # The link target lives outside the object store.
        external = tmp_path / "external_secret"
        external.write_bytes(b"secret content")

        sha = "b" * 64
        link = object_path(root, long_id(sha))
        link.parent.mkdir(parents=True, exist_ok=True)
        link.symlink_to(external)

        assert sha not in self._scan_ids(root), "Symlinked object file must not be listed"
        # Listing must leave the link target in place.
        assert external.exists()

    def test_grace_period_filters_recent_files(self, tmp_path: pathlib.Path) -> None:
        """Objects written within the grace window are excluded."""
        root = _make_repo(tmp_path)
        _write_object(root, b"new orphan")
        # The object is far younger than the 60 s window.
        listed = self._scan(root, grace=60)
        assert len(listed) == 0

    def test_grace_period_zero_includes_all_files(self, tmp_path: pathlib.Path) -> None:
        """grace_period_seconds=0 bypasses the mtime check."""
        root = _make_repo(tmp_path)
        _write_object(root, b"orphan")
        listed = self._scan(root)
        assert len(listed) == 1

    def test_non_hex_prefix_dir_skipped(self, tmp_path: pathlib.Path) -> None:
        """A prefix directory whose name is not hex contributes nothing."""
        root = _make_repo(tmp_path)
        (objects_dir(root) / "sha256" / "zz").mkdir(parents=True)
        listed = self._scan(root)
        assert len(listed) == 0

    def test_non_hex_object_file_skipped(self, tmp_path: pathlib.Path) -> None:
        """A file with a non-hex name inside a valid prefix dir is ignored."""
        root = _make_repo(tmp_path)
        prefix_dir = objects_dir(root) / "sha256" / "ab"
        prefix_dir.mkdir(parents=True)
        (prefix_dir / "not-valid-hex!").write_bytes(b"x")
        listed = self._scan(root)
        assert len(listed) == 0

    def test_valid_object_included(self, tmp_path: pathlib.Path) -> None:
        """An object written through the object store is listed."""
        root = _make_repo(tmp_path)
        oid = _write_object(root, b"valid object")
        assert oid in self._scan_ids(root)
| 348 | |
| 349 | |
| 350 | # --------------------------------------------------------------------------- |
| 351 | # Unit — _collect_reachable_objects |
| 352 | # --------------------------------------------------------------------------- |
| 353 | |
| 354 | |
class TestCollectReachableObjects:
    """Shelf handling in ``_collect_reachable_objects``: symlinks, size cap, bad JSON."""

    def test_shelf_symlink_skipped(self, tmp_path: pathlib.Path) -> None:
        """A symlinked shelf.json is ignored during the reachability walk."""
        from muse.core.gc import _collect_reachable_objects
        root = _make_repo(tmp_path)
        # Write a real object and make it look shelved via a symlink.
        obj_id = _write_object(root, b"shelved content")
        external = tmp_path / "real_shelf.json"
        external.write_text(json.dumps([{
            "snapshot_id": "s" * 64,
            "branch": "main",
            "created_at": "2026-01-01T00:00:00+00:00",
            "snapshot": {"a.py": obj_id},
        }]), encoding="utf-8")
        link = muse_dir(root) / "shelf.json"
        link.symlink_to(external)

        reachable = _collect_reachable_objects(root)
        # The object should NOT be marked reachable (symlink was skipped).
        assert obj_id not in reachable

    def test_shelf_oversized_file_skipped(self, tmp_path: pathlib.Path) -> None:
        """A shelf entry exceeding the size cap is skipped, not OOM-killed."""
        import unittest.mock as mock
        from typing import Any

        from muse.core.gc import _MAX_SHELF_BYTES, _collect_reachable_objects

        root = _make_repo(tmp_path)
        obj_id = _write_object(root, b"shelved content")
        entry_path = _write_shelf_entry(root, {"a.py": obj_id})
        # Regular file, one byte over the cap; every other stat field is zeroed.
        oversized = os.stat_result((
            stat.S_IFREG | 0o644, 0, 0, 1, 0, 0,
            _MAX_SHELF_BYTES + 1, 0, 0, 0,
        ))
        real_stat = pathlib.Path.stat

        def _stat_only_entry(self: pathlib.Path, *args: Any, **kwargs: Any) -> os.stat_result:
            # Fake the size for the shelf entry alone.  Patching ``stat`` for
            # every path would make directories look like regular files, and
            # the assertion below could then pass without the size cap ever
            # being consulted.
            # NOTE(review): assumes the walk reads the size via ``Path.stat()``
            # on a path equal to ``entry_path`` — confirm against muse.core.gc.
            if self == entry_path:
                return oversized
            return real_stat(self, *args, **kwargs)

        with mock.patch.object(pathlib.Path, "stat", _stat_only_entry):
            reachable = _collect_reachable_objects(root)
        assert obj_id not in reachable

    def test_malformed_shelf_json_skipped(self, tmp_path: pathlib.Path) -> None:
        """Unparseable shelf.json content must not raise."""
        from muse.core.gc import _collect_reachable_objects
        root = _make_repo(tmp_path)
        (muse_dir(root) / "shelf.json").write_text("not valid json{{{}}", encoding="utf-8")
        # Should not raise.
        reachable = _collect_reachable_objects(root)
        assert isinstance(reachable, set)

    def test_valid_shelf_objects_marked_reachable(self, tmp_path: pathlib.Path) -> None:
        """Objects referenced by a well-formed shelf entry are reachable."""
        from muse.core.gc import _collect_reachable_objects
        root = _make_repo(tmp_path)
        obj_id = _write_object(root, b"shelved content")
        _write_shelf_entry(root, {"a.py": obj_id})
        reachable = _collect_reachable_objects(root)
        assert obj_id in reachable
| 406 | |
| 407 | |
| 408 | # --------------------------------------------------------------------------- |
| 409 | # Unit — run_gc result fields |
| 410 | # --------------------------------------------------------------------------- |
| 411 | |
| 412 | |
class TestRunGcResult:
    """Fields echoed back on the object returned by ``run_gc``."""

    def test_grace_period_stored_in_result(self, tmp_path: pathlib.Path) -> None:
        from muse.core.gc import run_gc
        outcome = run_gc(_make_repo(tmp_path), grace_period_seconds=42)
        assert outcome.grace_period_seconds == 42

    def test_dry_run_flag_stored_in_result(self, tmp_path: pathlib.Path) -> None:
        from muse.core.gc import run_gc
        outcome = run_gc(_make_repo(tmp_path), dry_run=True, grace_period_seconds=0)
        assert outcome.dry_run is True

    def test_duration_ms_is_non_negative(self, tmp_path: pathlib.Path) -> None:
        from muse.core.gc import run_gc
        outcome = run_gc(_make_repo(tmp_path), grace_period_seconds=0)
        assert outcome.duration_ms >= 0.0
| 431 | |
| 432 | |
| 433 | # --------------------------------------------------------------------------- |
| 434 | # Security — CLI |
| 435 | # --------------------------------------------------------------------------- |
| 436 | |
| 437 | |
class TestSecurity:
    """CLI-level security checks: symlinks, output sanitising, argument validation."""

    def test_symlink_in_objects_not_deleted(self, tmp_path: pathlib.Path) -> None:
        """GC must never delete a file outside the repo via a symlink."""
        root = _make_repo(tmp_path)
        _make_commit(root)
        external = tmp_path / "precious_file"
        external.write_bytes(b"important data")
        sha = "c" * 64
        link = object_path(root, long_id(sha))
        link.parent.mkdir(parents=True, exist_ok=True)
        link.symlink_to(external)

        _invoke_gc(root)

        assert external.exists(), "External file must not be deleted via symlink"
        # Existence alone would miss a truncate/overwrite through the link.
        assert external.read_bytes() == b"important data", (
            "External file must not be modified via symlink"
        )

    def test_ansi_in_object_id_sanitized(self, tmp_path: pathlib.Path) -> None:
        """sanitize_display must strip ANSI sequences from object IDs in verbose output."""
        root = _make_repo(tmp_path)
        _make_commit(root)
        # Write a real orphan (we can't control its SHA, but we test the path is taken).
        _write_object(root, b"orphan for sanitize test")
        result = _invoke_gc(root, "--verbose")
        assert result.exit_code == 0
        # The output must not contain raw ESC bytes.
        assert "\x1b" not in result.output

    def test_invalid_format_exits_nonzero_and_writes_stderr(
        self, tmp_path: pathlib.Path
    ) -> None:
        """An unsupported ``--format`` value makes the command exit non-zero."""
        root = _make_repo(tmp_path)
        # argparse now uses choices= so invalid format triggers argparse error.
        # NOTE(review): despite the test name, only the exit code is checked;
        # stderr content is not asserted here.
        result = runner.invoke(cli, ["gc", "--format", "csv"], env=_env(root))
        assert result.exit_code != 0

    def test_negative_grace_period_rejected(self, tmp_path: pathlib.Path) -> None:
        """A negative ``--grace-period`` makes the command exit non-zero."""
        root = _make_repo(tmp_path)
        result = runner.invoke(cli, ["gc", "--grace-period", "-1"], env=_env(root))
        assert result.exit_code != 0
| 477 | |
| 478 | |
| 479 | # --------------------------------------------------------------------------- |
| 480 | # Integration — JSON output schema |
| 481 | # --------------------------------------------------------------------------- |
| 482 | |
| 483 | |
class TestJsonSchema:
    """Shape and values of the document printed by ``muse gc --json``."""

    @staticmethod
    def _gc_payload(root: pathlib.Path, *flags: str) -> _GcJson:
        """Run ``muse gc`` with *flags*, require exit code 0, return the parsed JSON."""
        outcome = _invoke_gc(root, *flags)
        assert outcome.exit_code == 0
        return _parse_gc_json(outcome.output)

    def test_json_schema_all_fields_present(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        _make_commit(root)
        _write_object(root, b"orphan for json test")
        doc = self._gc_payload(root, "--json")
        assert doc["collected_count"] == 1
        assert doc["collected_bytes"] > 0
        # commit + snapshot now live in the unified object store, so reachable_count >= 2
        assert doc["reachable_count"] >= 2
        assert doc["duration_ms"] >= 0.0
        assert doc["grace_period_seconds"] == 0
        assert doc["dry_run"] is False
        assert len(doc["collected_ids"]) == 1

    def test_json_dry_run_does_not_delete(self, tmp_path: pathlib.Path) -> None:
        from muse.core.object_store import has_object

        root = _make_repo(tmp_path)
        _make_commit(root)
        orphan_id = _write_object(root, b"dry run orphan")
        doc = self._gc_payload(root, "--dry-run", "--json")
        assert doc["dry_run"] is True
        assert doc["collected_count"] == 1
        # A dry run reports the orphan but must leave it on disk.
        assert has_object(root, orphan_id)

    def test_json_grace_period_field_reflects_flag(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        # An explicit --grace-period suppresses the helper's default of 0.
        doc = self._gc_payload(root, "--grace-period", "99", "--json")
        assert doc["grace_period_seconds"] == 99

    def test_json_collected_ids_sorted(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        for index in range(5):
            _write_object(root, f"sort test {index}".encode())
        ids = self._gc_payload(root, "--json")["collected_ids"]
        assert ids == sorted(ids)

    def test_json_clean_repo_shows_zero_counts(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        _make_commit(root)
        doc = self._gc_payload(root, "--json")
        assert doc["collected_count"] == 0
        assert doc["collected_bytes"] == 0
        assert doc["collected_ids"] == []

    def test_shorthand_json_flag(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        # Parsing succeeds or the helper raises; no field values are checked.
        self._gc_payload(root, "--json")
| 548 | |
| 549 | |
| 550 | # --------------------------------------------------------------------------- |
| 551 | # E2E — full lifecycle |
| 552 | # --------------------------------------------------------------------------- |
| 553 | |
| 554 | |
class TestE2E:
    """End-to-end ``muse gc`` runs through the CLI against a scratch repository."""

    @staticmethod
    def _gc_payload(root: pathlib.Path, *flags: str) -> _GcJson:
        """Run ``muse gc --json`` plus *flags*, require exit code 0, return the parsed JSON."""
        outcome = _invoke_gc(root, *flags, "--json")
        assert outcome.exit_code == 0
        return _parse_gc_json(outcome.output)

    def test_orphan_from_abandoned_branch_reclaimed(self, tmp_path: pathlib.Path) -> None:
        """Objects written for a branch that was never committed are reclaimed."""
        root = _make_repo(tmp_path)
        # Two objects exist in the store but no commit references them.
        orphans = [
            _write_object(root, b"branch work A"),
            _write_object(root, b"branch work B"),
        ]
        collected = self._gc_payload(root)["collected_ids"]
        for orphan in orphans:
            assert orphan in collected

    def test_gc_after_shelf_save_preserves_shelf_objects(self, tmp_path: pathlib.Path) -> None:
        from muse.core.object_store import has_object

        root = _make_repo(tmp_path)
        shelf_obj = _write_object(root, b"shelved file content")
        _write_shelf_entry(root, {"file.py": shelf_obj})

        doc = self._gc_payload(root)
        assert shelf_obj not in doc["collected_ids"]
        # The shelved blob must survive on disk as well.
        assert has_object(root, shelf_obj)

    def test_gc_with_corrupt_shelf_json_succeeds(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        orphan = _write_object(root, b"orphan despite corrupt shelf")
        (muse_dir(root) / "shelf.json").write_text("{not json", encoding="utf-8")
        doc = self._gc_payload(root)
        # A corrupt shelf file does not prevent the orphan from being collected.
        assert orphan in doc["collected_ids"]

    def test_grace_period_zero_collects_fresh_orphan(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        orphan = _write_object(root, b"fresh orphan")
        doc = self._gc_payload(root, "--grace-period", "0")
        assert orphan in doc["collected_ids"]

    def test_grace_period_large_protects_fresh_orphan(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        _write_object(root, b"fresh orphan protected")
        doc = self._gc_payload(root, "--grace-period", "9999")
        assert doc["collected_count"] == 0

    def test_repeated_gc_is_idempotent(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        _write_object(root, b"first orphan")
        # First pass removes the orphan; the second must find nothing left.
        _invoke_gc(root)
        doc = self._gc_payload(root)
        assert doc["collected_count"] == 0

    def test_gc_removes_empty_prefix_dirs(self, tmp_path: pathlib.Path) -> None:
        """After GC, empty prefix dirs under .muse/objects/sha256/ are cleaned up."""
        root = _make_repo(tmp_path)
        sha = _write_object(root, b"sole object in prefix")
        prefix_dir = object_path(root, sha).parent
        assert prefix_dir.exists()
        _invoke_gc(root)
        # The only object in the prefix dir was collected, so the dir goes too.
        assert not prefix_dir.exists()

    def test_verbose_lists_full_sha256_ids(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        orphan = _write_object(root, b"verbose test object")
        outcome = _invoke_gc(root, "--verbose")
        assert outcome.exit_code == 0
        assert orphan in outcome.output

    def test_dry_run_verbose_lists_without_deleting(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        orphan = _write_object(root, b"dry verbose test")
        outcome = _invoke_gc(root, "--dry-run", "--verbose")
        assert outcome.exit_code == 0
        assert orphan in outcome.output
        assert object_path(root, orphan).exists()

    def test_dry_run_prefix_present_in_text_output(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        outcome = _invoke_gc(root, "--dry-run")
        assert outcome.exit_code == 0
        assert "[dry-run]" in outcome.output

    def test_reachable_count_reflects_committed_objects(self, tmp_path: pathlib.Path) -> None:
        root = _make_repo(tmp_path)
        blob = _write_object(root, b"committed content")
        _make_commit(root, manifest={"file.txt": blob})
        # The exit code is deliberately not asserted here, matching the original test.
        doc = _parse_gc_json(_invoke_gc(root, "--json").output)
        # commit + snapshot + 1 blob = 3 reachable in the unified object store
        assert doc["reachable_count"] == 3
        assert doc["collected_count"] == 0
| 662 | |
| 663 | |
| 664 | # --------------------------------------------------------------------------- |
| 665 | # Stress |
| 666 | # --------------------------------------------------------------------------- |
| 667 | |
| 668 | |
class TestStress:
    """Larger object counts and concurrent dry runs."""

    def test_500_orphans_all_collected(self, tmp_path: pathlib.Path) -> None:
        from muse.core.object_store import has_object

        root = _make_repo(tmp_path)
        _make_commit(root)
        orphan_ids = []
        for index in range(500):
            orphan_ids.append(_write_object(root, f"stress-{index:04d}".encode()))

        outcome = _invoke_gc(root, "--json")
        assert outcome.exit_code == 0
        doc = _parse_gc_json(outcome.output)
        assert doc["collected_count"] == 500
        assert set(doc["collected_ids"]) == set(orphan_ids)
        # All orphan blobs must be gone; commit + snapshot remain (reachable).
        for oid in orphan_ids:
            assert not has_object(root, oid), f"orphan {oid} was not collected"
        # Only the commit and snapshot objects remain in the store.
        survivors = [entry for entry in objects_dir(root).rglob("*") if entry.is_file()]
        assert len(survivors) == 2

    def test_concurrent_dry_run_does_not_crash(self, tmp_path: pathlib.Path) -> None:
        """Multiple concurrent dry-run GCs on the same repo must not crash."""
        root = _make_repo(tmp_path)
        _make_commit(root)
        for index in range(20):
            _write_object(root, f"concurrent-orphan-{index}".encode())

        errors: list[str] = []

        def _run_dry() -> None:
            # Import inside the try so an import failure is recorded too.
            try:
                from muse.core.gc import run_gc
                run_gc(root, dry_run=True, grace_period_seconds=0)
            except Exception as exc:
                errors.append(str(exc))

        workers = []
        for _ in range(8):
            workers.append(threading.Thread(target=_run_dry))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        assert not errors, f"Concurrent dry-run GC failures: {errors}"

    def test_gc_across_many_prefix_dirs(self, tmp_path: pathlib.Path) -> None:
        """Objects spread across many prefix dirs are all found and collected."""
        root = _make_repo(tmp_path)
        # Varying the content varies the object ID and therefore the prefix dir.
        ids: list[str] = [
            _write_object(root, f"spread-{index:08d}".encode()) for index in range(100)
        ]
        # Guard: the scenario only makes sense with more than one prefix dir.
        algo_dir = objects_dir(root) / "sha256"
        prefix_dirs = [entry for entry in algo_dir.iterdir() if entry.is_dir()]
        assert len(prefix_dirs) > 1, "Test needs objects in multiple prefix dirs"

        doc = _parse_gc_json(_invoke_gc(root, "--json").output)
        assert doc["collected_count"] == 100
        assert set(doc["collected_ids"]) == set(ids)
File History
1 commit
sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b
fix: try fetch/presign before fetch/mpack to avoid Cloudfla…
Sonnet 4.6
patch
7 days ago