gabriel / muse public
test_push_object_delta.py python
364 lines 13.5 KB
Raw
1 """TDD — push only sends objects that are genuinely new.
2
3 Root cause
4 ----------
5 ``walk_commits`` and ``collect_blob_ids`` collect ALL objects from the
6 snapshots of new commits without subtracting objects already present in the
7 ``have`` commits' snapshots.
8
9 A snapshot is a full manifest of the repo state at a point in time — it
10 includes every file, not just changed ones. So for a 900-object repo, 1 new
11 commit still sends all 900 objects instead of just the 1–2 that changed.
12
13 The fix: subtract objects reachable from ``have`` commits' snapshots.
14
15 new_objects = objects_in_new_snapshots − objects_in_have_snapshots
16
17 Coverage
18 --------
19 I Unit — collect_blob_ids: unchanged objects excluded when have is set
20 II Unit — collect_blob_ids: new object (not in have snapshot) is included
21 III Unit — collect_blob_ids: object removed in new commit is excluded
22 IV Unit — walk_commits: all_blob_ids obeys the same delta semantics
23 V Unit — multi-file repo: 1 changed file → 1 object sent, not all files
VI Integration — 100-file repo, 1 new file added → only 1 object pushed
25 VII Regression — have=[] sends all objects (no regression)
26 VIII Regression — have commit with no local snapshot handled gracefully
27 """
28 from __future__ import annotations
29
30 import datetime
31 import json
32 import pathlib
33
34 import pytest
35
36 from muse._version import __version__
37 from muse.core.object_store import write_object
38 from muse.core.mpack import collect_blob_ids, walk_commits
39 from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id
40 from muse.core.commits import (
41 CommitRecord,
42 write_commit,
43 )
44 from muse.core.snapshots import (
45 SnapshotRecord,
46 write_snapshot,
47 )
48 from muse.core.types import Manifest, blob_id
49 from muse.core.paths import muse_dir
50
51
52 # ---------------------------------------------------------------------------
53 # Helpers
54 # ---------------------------------------------------------------------------
55
56
def _oid(content: bytes) -> str:
    """Return the ID that ``blob_id`` computes for *content*."""
    digest = blob_id(content)
    return digest
59
60
def _repo(tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch) -> pathlib.Path:
    """Lay out a minimal repository skeleton under *tmp_path* and return it.

    Creates the store directories, writes ``HEAD``, ``repo.json`` and
    ``config.toml``, then points ``MUSE_REPO_ROOT`` and the working
    directory at *tmp_path* through *monkeypatch*.
    """
    store = muse_dir(tmp_path)
    for subdir in ("commits", "snapshots", "objects", "refs/heads", "remotes"):
        (store / subdir).mkdir(parents=True, exist_ok=True)

    repo_meta = {"repo_id": "test-repo", "schema_version": __version__, "domain": "code"}
    seed_files = {
        "HEAD": "ref: refs/heads/main\n",
        "repo.json": json.dumps(repo_meta),
        "config.toml": '[remotes.origin]\nurl = "https://hub.example.com/r"\n',
    }
    for filename, text in seed_files.items():
        (store / filename).write_text(text)

    monkeypatch.setenv("MUSE_REPO_ROOT", str(tmp_path))
    monkeypatch.chdir(tmp_path)
    return tmp_path
73
74
def _write_commit(
    root: pathlib.Path,
    manifest: Manifest,
    *,
    parent_id: str | None = None,
) -> CommitRecord:
    """Write the snapshot and commit for *manifest*; return the CommitRecord.

    Blob objects are NOT written here: callers store them beforehand with
    ``_write_object`` and pass the resulting IDs as the manifest values.

    Args:
        root: Repository root handed to ``write_snapshot`` / ``write_commit``.
        manifest: Manifest recorded in the snapshot.
        parent_id: ID of the parent commit, or ``None`` for a commit
            without a parent.

    Returns:
        The ``CommitRecord`` that was passed to ``write_commit``.
    """
    snap_id = compute_snapshot_id(manifest)
    write_snapshot(root, SnapshotRecord(snapshot_id=snap_id, manifest=manifest))

    # Fixed timestamp: it is one of the inputs of the commit ID, so every
    # run feeds the same value into ``compute_commit_id``.
    ts = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
    parent_ids = [parent_id] if parent_id else []
    cid = compute_commit_id(
        parent_ids=parent_ids,
        snapshot_id=snap_id,
        message="test",
        committed_at_iso=ts.isoformat(),
    )
    commit = CommitRecord(
        commit_id=cid,
        branch="main",
        snapshot_id=snap_id,
        message="test",
        committed_at=ts,
        parent_commit_id=parent_id,
    )
    write_commit(root, commit)
    return commit
105
106
def _write_object(root: pathlib.Path, content: bytes) -> str:
    """Store *content* via ``write_object`` under *root* and return its ID."""
    object_id = _oid(content)
    write_object(root, object_id, content)
    return object_id
111
112
113 # ---------------------------------------------------------------------------
114 # I — unchanged objects are excluded when have is set
115 # ---------------------------------------------------------------------------
116
117
class TestCollectObjectIdsDelta:
    """collect_blob_ids must return only blobs absent from the have-snapshots."""

    def test_unchanged_object_excluded_when_in_have_snapshot(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """A blob listed by both the have-commit and the new commit is skipped."""
        root = _repo(tmp_path, monkeypatch)
        same_blob = _write_object(root, b"unchanged file")

        # `base` is what the server already has; `tip` re-commits the same file.
        base = _write_commit(root, {"file.txt": same_blob})
        tip = _write_commit(root, {"file.txt": same_blob}, parent_id=base.commit_id)

        to_send = collect_blob_ids(root, [tip.commit_id], have=[base.commit_id])

        assert same_blob not in to_send, (
            "Object present in have-snapshot must not be re-sent"
        )

    def test_new_object_included_when_not_in_have_snapshot(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """A blob that appears only in the new commit's snapshot is sent."""
        root = _repo(tmp_path, monkeypatch)
        existing_blob = _write_object(root, b"old file content")
        added_blob = _write_object(root, b"brand new file")

        base = _write_commit(root, {"old.txt": existing_blob})
        tip_manifest = {"old.txt": existing_blob, "new.txt": added_blob}
        tip = _write_commit(root, tip_manifest, parent_id=base.commit_id)

        to_send = collect_blob_ids(root, [tip.commit_id], have=[base.commit_id])

        assert added_blob in to_send, "New object not in have-snapshot must be sent"
        assert existing_blob not in to_send, "Object already in have-snapshot must not be sent"

    def test_removed_object_excluded(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """A blob in the have-snapshot that the new commit drops is not sent."""
        root = _repo(tmp_path, monkeypatch)
        surviving_blob = _write_object(root, b"kept file")
        dropped_blob = _write_object(root, b"file that gets deleted")

        base = _write_commit(root, {"kept.txt": surviving_blob, "gone.txt": dropped_blob})
        # The tip commit no longer lists gone.txt.
        tip = _write_commit(root, {"kept.txt": surviving_blob}, parent_id=base.commit_id)

        to_send = collect_blob_ids(root, [tip.commit_id], have=[base.commit_id])

        assert dropped_blob not in to_send
        assert surviving_blob not in to_send  # unchanged, so already on the server

    def test_empty_delta_when_no_changes(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """A new commit with an identical snapshot yields nothing to send."""
        root = _repo(tmp_path, monkeypatch)
        blob = _write_object(root, b"content")

        base = _write_commit(root, {"f.txt": blob})
        # Same manifest again; only the parent differs.
        tip = _write_commit(root, {"f.txt": blob}, parent_id=base.commit_id)

        to_send = collect_blob_ids(root, [tip.commit_id], have=[base.commit_id])

        assert to_send == [], f"Expected no objects to send, got {to_send}"
189
190
191 # ---------------------------------------------------------------------------
192 # IV — walk_commits obeys the same delta semantics
193 # ---------------------------------------------------------------------------
194
195
class TestWalkCommitsDelta:
    """walk_commits must apply the same have-subtraction as collect_blob_ids."""

    def test_walk_all_blob_ids_excludes_have_objects(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """``all_blob_ids`` lists the new blob but not the one the have-commit has."""
        root = _repo(tmp_path, monkeypatch)
        common_blob = _write_object(root, b"shared across commits")
        added_blob = _write_object(root, b"only in new commit")

        base = _write_commit(root, {"shared.txt": common_blob})
        tip_manifest = {"shared.txt": common_blob, "new.txt": added_blob}
        tip = _write_commit(root, tip_manifest, parent_id=base.commit_id)

        walked = walk_commits(root, [tip.commit_id], have=[base.commit_id])
        blob_ids = walked["all_blob_ids"]

        assert added_blob in blob_ids
        assert common_blob not in blob_ids, (
            "walk_commits must exclude objects already in have-snapshot"
        )
218
219
220 # ---------------------------------------------------------------------------
221 # V — multi-file repo: only the changed file is sent
222 # ---------------------------------------------------------------------------
223
224
class TestMultiFileDelta:
    """With many files and one edit, only the edited file's blob is collected."""

    def test_only_changed_file_sent_in_10_file_repo(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Ten files, one modified between commits → exactly that one blob."""
        root = _repo(tmp_path, monkeypatch)

        # Base commit: ten files, each with its own blob.
        base_manifest: Manifest = {
            f"file{i:02d}.mid": _write_object(root, f"file {i} original content".encode())
            for i in range(10)
        }
        base = _write_commit(root, base_manifest)

        # Tip commit: same tree except file05.mid points at a new blob.
        changed_blob = _write_object(root, b"file 5 modified content")
        tip_manifest = {**base_manifest, "file05.mid": changed_blob}
        tip = _write_commit(root, tip_manifest, parent_id=base.commit_id)

        to_send = collect_blob_ids(root, [tip.commit_id], have=[base.commit_id])

        assert to_send == [changed_blob], (
            f"Expected only 1 modified object, got {len(to_send)}: {to_send}"
        )
253
254
255 # ---------------------------------------------------------------------------
256 # VI — integration: 1 added file in large repo
257 # ---------------------------------------------------------------------------
258
259
class TestLargeRepoDelta:
    """Adding one file to a 100-file tree must collect exactly one blob."""

    def test_one_added_file_sends_one_object(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """100 files in the base commit, one extra file in the tip → 1 blob."""
        root = _repo(tmp_path, monkeypatch)

        base_manifest: Manifest = {
            f"track{i:03d}.mid": _write_object(root, f"track {i} content".encode())
            for i in range(100)
        }
        base = _write_commit(root, base_manifest)

        # The tip commit keeps every track and appends one new file.
        added_blob = _write_object(root, b"brand new track content")
        tip_manifest = dict(base_manifest)
        tip_manifest["new_track.mid"] = added_blob
        tip = _write_commit(root, tip_manifest, parent_id=base.commit_id)

        to_send = collect_blob_ids(root, [tip.commit_id], have=[base.commit_id])

        assert to_send == [added_blob], (
            f"Expected exactly 1 new object, got {len(to_send)}"
        )
284
285
286 # ---------------------------------------------------------------------------
287 # VII — regression: have=[] sends all objects
288 # ---------------------------------------------------------------------------
289
290
class TestNoHaveRegression:
    """An empty ``have`` list must still yield every blob of the pushed commits."""

    def test_no_have_sends_all_objects(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """With ``have=[]`` both blobs of the two-commit chain are returned."""
        root = _repo(tmp_path, monkeypatch)
        first_blob = _write_object(root, b"obj1")
        second_blob = _write_object(root, b"obj2")

        base = _write_commit(root, {"a.txt": first_blob})
        tip = _write_commit(
            root,
            {"a.txt": first_blob, "b.txt": second_blob},
            parent_id=base.commit_id,
        )

        to_send = collect_blob_ids(root, [tip.commit_id], have=[])

        for expected in (first_blob, second_blob):
            assert expected in to_send
307
308
309 # ---------------------------------------------------------------------------
310 # VIII — graceful handling: have commit has no local snapshot
311 # ---------------------------------------------------------------------------
312
313
class TestMissingHaveSnapshot:
    """A have-commit whose snapshot is not stored locally must not break collection."""

    def test_missing_have_snapshot_treated_as_no_have(
        self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """collect_blob_ids returns a list even when the have-snapshot is absent."""
        root = _repo(tmp_path, monkeypatch)
        existing_blob = _write_object(root, b"some object")
        when = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)

        # Phantom have-commit: the commit record is written, but the snapshot
        # it points at is deliberately never stored.
        phantom_snap_id = compute_snapshot_id({"f.txt": existing_blob})
        phantom_cid = compute_commit_id(
            parent_ids=[],
            snapshot_id=phantom_snap_id,
            message="phantom",
            committed_at_iso=when.isoformat(),
        )
        write_commit(
            root,
            CommitRecord(
                commit_id=phantom_cid,
                branch="main",
                snapshot_id=phantom_snap_id,
                message="phantom",
                committed_at=when,
            ),
        )

        # Real child commit: snapshot and blobs are all present locally.
        added_blob = _write_object(root, b"new object")
        tip_manifest = {"f.txt": existing_blob, "g.txt": added_blob}
        tip_snap_id = compute_snapshot_id(tip_manifest)
        write_snapshot(root, SnapshotRecord(snapshot_id=tip_snap_id, manifest=tip_manifest))
        # Stores the first blob a second time, exactly as before.
        write_object(root, existing_blob, b"some object")
        tip_cid = compute_commit_id(
            parent_ids=[phantom_cid],
            snapshot_id=tip_snap_id,
            message="real",
            committed_at_iso=when.isoformat(),
        )
        write_commit(
            root,
            CommitRecord(
                commit_id=tip_cid,
                branch="main",
                snapshot_id=tip_snap_id,
                message="real",
                committed_at=when,
                parent_commit_id=phantom_cid,
            ),
        )

        # Over-sending is acceptable here; raising is not.
        to_send = collect_blob_ids(root, [tip_cid], have=[phantom_cid])
        assert isinstance(to_send, list)
File History 1 commit