"""TDD — push only sends objects that are genuinely new. Root cause ---------- ``walk_commits`` and ``collect_blob_ids`` collect ALL objects from the snapshots of new commits without subtracting objects already present in the ``have`` commits' snapshots. A snapshot is a full manifest of the repo state at a point in time — it includes every file, not just changed ones. So for a 900-object repo, 1 new commit still sends all 900 objects instead of just the 1–2 that changed. The fix: subtract objects reachable from ``have`` commits' snapshots. new_objects = objects_in_new_snapshots − objects_in_have_snapshots Coverage -------- I Unit — collect_blob_ids: unchanged objects excluded when have is set II Unit — collect_blob_ids: new object (not in have snapshot) is included III Unit — collect_blob_ids: object removed in new commit is excluded IV Unit — walk_commits: all_blob_ids obeys the same delta semantics V Unit — multi-file repo: 1 changed file → 1 object sent, not all files VI Integration — 10-file repo, 9 unchanged, 1 changed → only 1 object pushed VII Regression — have=[] sends all objects (no regression) VIII Regression — have commit with no local snapshot handled gracefully """ from __future__ import annotations import datetime import json import pathlib import pytest from muse._version import __version__ from muse.core.object_store import write_object from muse.core.mpack import collect_blob_ids, walk_commits from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id from muse.core.commits import ( CommitRecord, write_commit, ) from muse.core.snapshots import ( SnapshotRecord, write_snapshot, ) from muse.core.types import Manifest, blob_id from muse.core.paths import muse_dir # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _oid(content: bytes) -> str: return blob_id(content) def _repo(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.Path: dot_muse = muse_dir(tmp_path) for d in ("commits", "snapshots", "objects", "refs/heads", "remotes"): (dot_muse / d).mkdir(parents=True, exist_ok=True) (dot_muse / "HEAD").write_text("ref: refs/heads/main\n") (dot_muse / "repo.json").write_text( json.dumps({"repo_id": "test-repo", "schema_version": __version__, "domain": "code"}) ) (dot_muse / "config.toml").write_text('[remotes.origin]\nurl = "https://hub.example.com/r"\n') monkeypatch.setenv("MUSE_REPO_ROOT", str(tmp_path)) monkeypatch.chdir(tmp_path) return tmp_path def _write_commit( root: pathlib.Path, manifest: Manifest, *, parent_id: str | None = None, ) -> CommitRecord: """Write objects, snapshot, and commit; return the CommitRecord.""" for oid, raw in [(oid, None) for oid in manifest.values()]: # objects were written by the caller via _write_object pass snap_id = compute_snapshot_id(manifest) write_snapshot(root, SnapshotRecord(snapshot_id=snap_id, manifest=manifest)) ts = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc) parent_ids = [parent_id] if parent_id else [] cid = compute_commit_id( parent_ids=parent_ids, snapshot_id=snap_id, message="test", committed_at_iso=ts.isoformat(), ) commit = CommitRecord( commit_id=cid, branch="main", snapshot_id=snap_id, message="test", committed_at=ts, parent_commit_id=parent_id, ) write_commit(root, commit) return commit def _write_object(root: pathlib.Path, content: bytes) -> str: oid = _oid(content) write_object(root, oid, content) return oid # --------------------------------------------------------------------------- # I — unchanged objects are excluded when have is set # --------------------------------------------------------------------------- class TestCollectObjectIdsDelta: def test_unchanged_object_excluded_when_in_have_snapshot( self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch ) -> None: """Object present in both have-commit and new-commit snapshot → not sent.""" root = _repo(tmp_path, monkeypatch) unchanged = _write_object(root, b"unchanged file") # Commit A (the server has this) commit_a = _write_commit(root, {"file.txt": unchanged}) # Commit B (new — same file, no changes) commit_b = _write_commit(root, {"file.txt": unchanged}, parent_id=commit_a.commit_id) result = collect_blob_ids(root, [commit_b.commit_id], have=[commit_a.commit_id]) assert unchanged not in result, ( "Object present in have-snapshot must not be re-sent" ) def test_new_object_included_when_not_in_have_snapshot( self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch ) -> None: """Object only in new-commit snapshot → must be sent.""" root = _repo(tmp_path, monkeypatch) old_file = _write_object(root, b"old file content") new_file = _write_object(root, b"brand new file") commit_a = _write_commit(root, {"old.txt": old_file}) commit_b = _write_commit( root, {"old.txt": old_file, "new.txt": new_file}, parent_id=commit_a.commit_id, ) result = collect_blob_ids(root, [commit_b.commit_id], have=[commit_a.commit_id]) assert new_file in result, "New object not in have-snapshot must be sent" assert old_file not in result, "Object already in have-snapshot must not be sent" def test_removed_object_excluded( self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch ) -> None: """Object present in have-snapshot but deleted in new commit → not sent.""" root = _repo(tmp_path, monkeypatch) kept = _write_object(root, b"kept file") removed = _write_object(root, b"file that gets deleted") commit_a = _write_commit(root, {"kept.txt": kept, "gone.txt": removed}) # Commit B removes gone.txt commit_b = _write_commit(root, {"kept.txt": kept}, parent_id=commit_a.commit_id) result = collect_blob_ids(root, [commit_b.commit_id], have=[commit_a.commit_id]) assert removed not in result assert kept not in result # still unchanged def test_empty_delta_when_no_changes( self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch ) -> None: """Identical snapshot in new commit → zero objects sent.""" root = _repo(tmp_path, monkeypatch) obj = _write_object(root, b"content") commit_a = _write_commit(root, {"f.txt": obj}) # Commit B — identical snapshot (content unchanged) commit_b = _write_commit(root, {"f.txt": obj}, parent_id=commit_a.commit_id) result = collect_blob_ids(root, [commit_b.commit_id], have=[commit_a.commit_id]) assert result == [], f"Expected no objects to send, got {result}" # --------------------------------------------------------------------------- # IV — walk_commits obeys the same delta semantics # --------------------------------------------------------------------------- class TestWalkCommitsDelta: def test_walk_all_blob_ids_excludes_have_objects( self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch ) -> None: """walk_commits.all_blob_ids must subtract have-snapshot objects.""" root = _repo(tmp_path, monkeypatch) shared = _write_object(root, b"shared across commits") new_obj = _write_object(root, b"only in new commit") commit_a = _write_commit(root, {"shared.txt": shared}) commit_b = _write_commit( root, {"shared.txt": shared, "new.txt": new_obj}, parent_id=commit_a.commit_id, ) walk = walk_commits(root, [commit_b.commit_id], have=[commit_a.commit_id]) assert new_obj in walk["all_blob_ids"] assert shared not in walk["all_blob_ids"], ( "walk_commits must exclude objects already in have-snapshot" ) # --------------------------------------------------------------------------- # V — multi-file repo: only the changed file is sent # --------------------------------------------------------------------------- class TestMultiFileDelta: def test_only_changed_file_sent_in_10_file_repo( self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch ) -> None: """10-file repo: 9 unchanged + 1 modified → only 1 object sent.""" root = _repo(tmp_path, monkeypatch) # Create 10 files in commit A files_a: Manifest = {} for i in range(10): content = f"file {i} original content".encode() oid = _write_object(root, content) files_a[f"file{i:02d}.mid"] = oid commit_a = _write_commit(root, files_a) # Commit B: modify only file05.mid files_b = dict(files_a) modified_oid = _write_object(root, b"file 5 modified content") files_b["file05.mid"] = modified_oid commit_b = _write_commit(root, files_b, parent_id=commit_a.commit_id) result = collect_blob_ids(root, [commit_b.commit_id], have=[commit_a.commit_id]) assert result == [modified_oid], ( f"Expected only 1 modified object, got {len(result)}: {result}" ) # --------------------------------------------------------------------------- # VI — integration: 1 added file in large repo # --------------------------------------------------------------------------- class TestLargeRepoDelta: def test_one_added_file_sends_one_object( self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch ) -> None: """100-file repo, add 1 new file → 1 object sent.""" root = _repo(tmp_path, monkeypatch) files_a: Manifest = {} for i in range(100): oid = _write_object(root, f"track {i} content".encode()) files_a[f"track{i:03d}.mid"] = oid commit_a = _write_commit(root, files_a) # Add one new file new_oid = _write_object(root, b"brand new track content") files_b = {**files_a, "new_track.mid": new_oid} commit_b = _write_commit(root, files_b, parent_id=commit_a.commit_id) result = collect_blob_ids(root, [commit_b.commit_id], have=[commit_a.commit_id]) assert result == [new_oid], ( f"Expected exactly 1 new object, got {len(result)}" ) # --------------------------------------------------------------------------- # VII — regression: have=[] sends all objects # --------------------------------------------------------------------------- class TestNoHaveRegression: def test_no_have_sends_all_objects( self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch ) -> None: """Without have, all objects in the commit graph are returned.""" root = _repo(tmp_path, monkeypatch) obj1 = _write_object(root, b"obj1") obj2 = _write_object(root, b"obj2") commit_a = _write_commit(root, {"a.txt": obj1}) commit_b = _write_commit(root, {"a.txt": obj1, "b.txt": obj2}, parent_id=commit_a.commit_id) result = collect_blob_ids(root, [commit_b.commit_id], have=[]) assert obj1 in result assert obj2 in result # --------------------------------------------------------------------------- # VIII — graceful handling: have commit has no local snapshot # --------------------------------------------------------------------------- class TestMissingHaveSnapshot: def test_missing_have_snapshot_treated_as_no_have( self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch ) -> None: """If a have-commit's snapshot isn't local, don't crash — send the objects.""" root = _repo(tmp_path, monkeypatch) obj = _write_object(root, b"some object") # Write only a commit record without writing its snapshot locally snap_id = compute_snapshot_id({"f.txt": obj}) ts = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc) fake_have_cid = compute_commit_id( parent_ids=[], snapshot_id=snap_id, message="phantom", committed_at_iso=ts.isoformat(), ) phantom_commit = CommitRecord( commit_id=fake_have_cid, branch="main", snapshot_id=snap_id, message="phantom", committed_at=ts, ) write_commit(root, phantom_commit) # Note: snapshot is NOT written locally new_obj = _write_object(root, b"new object") snap2_id = compute_snapshot_id({"f.txt": obj, "g.txt": new_obj}) write_snapshot(root, SnapshotRecord(snapshot_id=snap2_id, manifest={"f.txt": obj, "g.txt": new_obj})) write_object(root, obj, b"some object") cid2 = compute_commit_id( parent_ids=[fake_have_cid], snapshot_id=snap2_id, message="real", committed_at_iso=ts.isoformat(), ) commit2 = CommitRecord( commit_id=cid2, branch="main", snapshot_id=snap2_id, message="real", committed_at=ts, parent_commit_id=fake_have_cid, ) write_commit(root, commit2) # Should not crash; since have-snapshot is missing, objects may be over-sent # but must not raise result = collect_blob_ids(root, [cid2], have=[fake_have_cid]) assert isinstance(result, list)