gabriel / muse public
test_push_snapshot_loading.py python
233 lines 8.3 KB
Raw
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 20 days ago
1 """TDD — mpack push path must not read snapshot manifests redundantly.
2
3 Today the mpack path reads 718K manifest entries three times:
4 1. snapshots_list loading — full manifests, redundant on the mpack path
5 2. collect_blob_ids — full manifests, used only for progress count
6 3. build_mpack/_build_snapshot_deltas — full manifests (the needed pass)
7
8 Passes 1 and 2 are dead work on the mpack path.
9
10 The fix: a new function collect_blob_ids_from_deltas() extracts object IDs
11 directly from the already-computed delta list — zero additional disk reads.
12 The mpack push path calls _build_snapshot_deltas once, then derives both
13 the wire delta list AND the object ID set from that single pass.
14
15 Correctness invariant:
16 collect_blob_ids_from_deltas(deltas) == collect_blob_ids(repo, [head])
17
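Performance expectations for 1031 commits × 700 files × 5 changed/commit:
  collect_blob_ids_from_deltas: < 10ms (pure in-memory, no disk I/O) — asserted by a timing test below
  _build_snapshot_deltas: < 500ms target (one disk read per snapshot) — this module asserts the
    read count (exactly one read_snapshot call per commit), not the wall time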
21
22 Dimensions match the real musehub repo.
23 """
24 from __future__ import annotations
25
26 import datetime
27 import pathlib
28 import time
29
30 import pytest
31
32 from muse.core.object_store import write_object
33 from muse.core.mpack import (
34 _build_snapshot_deltas,
35 collect_blob_ids,
36 collect_blob_ids_from_deltas,
37 )
38 from muse.core.paths import muse_dir
39 from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id
40 from muse.core.graph import iter_ancestors
41 from muse.core.refs import write_branch_ref
42 from muse.core.commits import (
43 CommitRecord,
44 read_commit,
45 write_commit,
46 )
47 from muse.core.snapshots import (
48 SnapshotRecord,
49 write_snapshot,
50 )
51 from muse.core.types import blob_id
52
53
54 # ---------------------------------------------------------------------------
55 # Dimensions — match real musehub repo
56 # ---------------------------------------------------------------------------
57
# Synthetic-repo dimensions consumed by _make_repo and by the assertions below.
_N_FILES: int = 700        # number of paths in every snapshot manifest
_N_COMMITS: int = 1_031    # length of the linear commit chain
_FILES_CHANGED: int = 5    # manifest entries re-pointed to fresh blobs per commit
_BLOB_SIZE: int = 512      # padding bytes appended to every blob payload
62
63
64 # ---------------------------------------------------------------------------
65 # Repo builder
66 # ---------------------------------------------------------------------------
67
def _make_repo(tmp: pathlib.Path) -> tuple[pathlib.Path, str]:
    """Build a linear-history repo under *tmp* and return ``(repo_root, head_commit_id)``.

    Layout written: the muse directory with ``repo.json``, ``HEAD``,
    ``config.toml``, the ``commits`` / ``snapshots`` / ``objects`` directories
    and ``refs/heads``.  History is ``_N_COMMITS`` commits on ``main``; each
    commit's manifest is the ``_N_FILES``-entry base manifest with
    ``_FILES_CHANGED`` entries re-pointed to blobs unique to that commit
    (changes do not accumulate from one commit to the next).
    """
    tmp.mkdir(parents=True, exist_ok=True)
    store = muse_dir(tmp)
    store.mkdir()
    (store / "repo.json").write_text('{"repo_id":"snap-load-test","owner":"gabriel"}')
    for subdir in ("commits", "snapshots", "objects"):
        (store / subdir).mkdir()
    (store / "refs" / "heads").mkdir(parents=True)
    (store / "HEAD").write_text("ref: refs/heads/main\n")
    (store / "config.toml").write_text("")

    # Baseline content: one distinct blob per path, recorded as it is written.
    base_manifest: dict[str, str] = {}
    for file_no in range(_N_FILES):
        payload = f"base-{file_no:06d}".encode() + b"x" * _BLOB_SIZE
        oid = blob_id(payload)
        write_object(tmp, oid, payload)
        base_manifest[f"src/file_{file_no:04d}.py"] = oid

    epoch = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
    one_minute = datetime.timedelta(seconds=60)
    parent: str | None = None

    for commit_no in range(_N_COMMITS):
        # Start from the baseline every time; only this commit's slots differ.
        manifest = base_manifest.copy()
        first_slot = commit_no * _FILES_CHANGED
        for offset in range(_FILES_CHANGED):
            slot = (first_slot + offset) % _N_FILES
            payload = f"commit-{commit_no:05d}-file-{offset}".encode() + b"y" * _BLOB_SIZE
            oid = blob_id(payload)
            write_object(tmp, oid, payload)
            manifest[f"src/file_{slot:04d}.py"] = oid

        snapshot_id = compute_snapshot_id(manifest)
        write_snapshot(tmp, SnapshotRecord(snapshot_id=snapshot_id, manifest=manifest))

        # Commits are spaced exactly one minute apart, starting at the epoch.
        committed_at = epoch + commit_no * one_minute
        message = f"commit-{commit_no:05d}"
        commit_id = compute_commit_id(
            parent_ids=[] if not parent else [parent],
            snapshot_id=snapshot_id,
            message=message,
            committed_at_iso=committed_at.isoformat(),
            author="gabriel",
        )
        write_commit(tmp, CommitRecord(
            commit_id=commit_id,
            branch="main",
            snapshot_id=snapshot_id,
            message=message,
            committed_at=committed_at,
            parent_commit_id=parent,
            parent2_commit_id=None,
            author="gabriel",
            metadata={},
            structured_delta=None,
            sem_ver_bump="none",
            breaking_changes=[],
            agent_id="", model_id="", toolchain_id="",
            prompt_hash="", signature="", signer_key_id="",
        ))
        parent = commit_id

    # The last commit written is the branch tip; empty string if none were written.
    head = parent or ""
    write_branch_ref(tmp, "main", head)
    return tmp, head
137
138
139 # ---------------------------------------------------------------------------
140 # Tests
141 # ---------------------------------------------------------------------------
142
def test_collect_blob_ids_from_deltas_correctness(tmp_path: pathlib.Path) -> None:
    """The delta-derived object ID set equals the manifest-scan object ID set.

    Builds the synthetic repo, computes snapshot deltas for the full history
    (oldest commit first), and checks that collect_blob_ids_from_deltas yields
    exactly the IDs collect_blob_ids yields for the same head — nothing extra,
    nothing missing.
    """
    repo, head = _make_repo(tmp_path / "repo")

    # iter_ancestors output is reversed to obtain oldest-first order.
    history = list(iter_ancestors(repo, [head]))
    history.reverse()
    deltas = _build_snapshot_deltas(repo, history)

    from_deltas = set(collect_blob_ids_from_deltas(deltas))
    from_manifests = set(collect_blob_ids(repo, [head]))

    extra = from_deltas - from_manifests
    missing = from_manifests - from_deltas
    failure = (
        f"delta method returned {len(from_deltas)} IDs, "
        f"manifest method returned {len(from_manifests)} IDs\n"
        f" only in deltas: {len(extra)}\n"
        f" only in manifests: {len(missing)}"
    )
    assert from_deltas == from_manifests, failure
163
164
def test_collect_blob_ids_from_deltas_is_fast(tmp_path: pathlib.Path) -> None:
    """collect_blob_ids_from_deltas must run in < 10ms — it is pure in-memory.

    The deltas are computed up front by _build_snapshot_deltas; only the
    collect_blob_ids_from_deltas call itself is timed.  The function is
    expected to be plain iteration over the delta entries — no disk I/O, no
    hashing.

    This gate proves we are not sneaking any per-delta disk reads back in.

    The call is timed several times and the fastest sample is gated: a single
    wall-clock sample can be inflated by scheduler or GC noise unrelated to
    the code under test, whereas the minimum is a stable lower bound on the
    real cost.
    """
    repo, head = _make_repo(tmp_path / "repo")

    commits_oldest_first = list(reversed(list(iter_ancestors(repo, [head]))))
    deltas = _build_snapshot_deltas(repo, commits_oldest_first)

    # Best-of-N timing: the slow outliers measure the machine, not the function.
    timing_runs = 5
    samples_ms: list[float] = []
    for _ in range(timing_runs):
        t0 = time.perf_counter()
        ids = collect_blob_ids_from_deltas(deltas)
        samples_ms.append((time.perf_counter() - t0) * 1000)
    elapsed_ms = min(samples_ms)

    assert elapsed_ms < 10, (
        f"collect_blob_ids_from_deltas took {elapsed_ms:.1f}ms — "
        f"expected < 10ms (pure in-memory, no disk I/O)\n"
        f" {len(ids)} object IDs from {len(deltas)} deltas"
    )

    # Shown only when pytest output capture is disabled (e.g. -s).
    print(
        f"\n {_N_COMMITS} commits × {_N_FILES} files × {_FILES_CHANGED} changed\n"
        f" delta count: {len(deltas)}\n"
        f" object count: {len(ids)}\n"
        f" elapsed: {elapsed_ms:.2f}ms"
    )
196
197
def test_build_snapshot_deltas_single_pass(tmp_path: pathlib.Path) -> None:
    """_build_snapshot_deltas reads each snapshot exactly once.

    Proves that the authoritative manifest pass is O(N) disk reads, not O(N×F).
    After this pass, all subsequent operations (collect_blob_ids_from_deltas,
    build_mpack_from_walk) must derive their data from the delta list — no
    additional snapshot reads.

    The check works by temporarily replacing the ``read_snapshot`` name bound
    in ``muse.core.mpack`` with a wrapper that records every snapshot ID it is
    asked for, then asserting on the recorded calls.
    """
    import muse.core.mpack as _pack

    repo, head = _make_repo(tmp_path / "repo")

    calls: list[str] = []

    # Capture the object mpack actually has bound so that the wrapper delegates
    # to it and the finally-clause restores exactly that object.  This also
    # fails fast with AttributeError if mpack stops exposing ``read_snapshot``,
    # instead of silently patching a name nothing calls.
    real_read_snapshot = _pack.read_snapshot

    def counting_read_snapshot(root: pathlib.Path, sid: str) -> "SnapshotRecord | None":
        calls.append(sid)
        return real_read_snapshot(root, sid)

    _pack.read_snapshot = counting_read_snapshot
    try:
        commits_oldest_first = list(reversed(list(iter_ancestors(repo, [head]))))
        # Only the recorded read calls matter here; the deltas are discarded.
        _build_snapshot_deltas(repo, commits_oldest_first)
    finally:
        _pack.read_snapshot = real_read_snapshot

    unique_reads = len(set(calls))
    assert unique_reads == _N_COMMITS, (
        f"_build_snapshot_deltas read {unique_reads} unique snapshots, "
        f"expected exactly {_N_COMMITS}"
    )
    assert len(calls) == _N_COMMITS, (
        f"_build_snapshot_deltas called read_snapshot {len(calls)} times, "
        f"expected exactly {_N_COMMITS} (one per commit, no duplicates)"
    )
File History 6 commits
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 20 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 22 days ago
sha256:0313c134f0ef4518a9c3a0ec359ffdc42546dc720010730374edfe0857caf7ef rename: delta_add → delta_upsert across wire format, source… Sonnet 4.6 minor 22 days ago
sha256:fb19dc03703eb3fc11d016ea19f619eebfab7bde2acf247346dc0f032e65ff19 fix(push): step 0 log shows full /refs URL instead of misle… Sonnet 4.6 patch 22 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 28 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 28 days ago