gabriel/muse — blame/sha256:8/test_push_snapshot_loading.py

1 files

1 commits

0 hotspots

0 🧊 dead

0 💥 blast risk

sha256:4 Merge branch 'dev' into main · gabriel · Jun 17, 2026

1	"""TDD — mpack push path must not read snapshot manifests redundantly.
2
3	Today the mpack path reads 718K manifest entries three times:
4	1. snapshots_list loading — full manifests, redundant on the mpack path
5	2. collect_blob_ids — full manifests, used only for progress count
6	3. build_mpack/_build_snapshot_deltas — full manifests (the needed pass)
7
8	Passes 1 and 2 are dead work on the mpack path.
9
10	The fix: a new function collect_blob_ids_from_deltas() extracts object IDs
11	directly from the already-computed delta list — zero additional disk reads.
12	The mpack push path calls _build_snapshot_deltas once, then derives both
13	the wire delta list AND the object ID set from that single pass.
14
15	Correctness invariant:
16	collect_blob_ids_from_deltas(deltas) == collect_blob_ids(repo, [head])
17
18	Performance gate for 1031 commits × 700 files × 5 changed/commit:
19	collect_blob_ids_from_deltas: < 10ms (pure in-memory, no disk I/O)
20	_build_snapshot_deltas: < 500ms (one disk read per snapshot)
21
22	Dimensions match the real musehub repo.
23	"""
24	from __future__ import annotations
25
26	import datetime
27	import pathlib
28	import time
29
30	import pytest
31
32	from muse.core.object_store import write_object
33	from muse.core.mpack import (
34	_build_snapshot_deltas,
35	collect_blob_ids,
36	collect_blob_ids_from_deltas,
37	)
38	from muse.core.paths import muse_dir
39	from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id
40	from muse.core.graph import iter_ancestors
41	from muse.core.refs import write_branch_ref
42	from muse.core.commits import (
43	CommitRecord,
44	read_commit,
45	write_commit,
46	)
47	from muse.core.snapshots import (
48	SnapshotRecord,
49	write_snapshot,
50	)
51	from muse.core.types import blob_id
52
53
54	# ---------------------------------------------------------------------------
55	# Dimensions — match real musehub repo
56	# ---------------------------------------------------------------------------
57
# Number of files in the base manifest; one base blob is written per file.
_N_FILES = 700
# Number of commits generated in the linear history.
_N_COMMITS = 1_031
# Number of manifest entries replaced with freshly written blobs per commit.
_FILES_CHANGED = 5
# Number of padding bytes appended to every generated blob payload.
_BLOB_SIZE = 512
62
63
64	# ---------------------------------------------------------------------------
65	# Repo builder
66	# ---------------------------------------------------------------------------
67
def _make_repo(tmp: pathlib.Path) -> tuple[pathlib.Path, str]:
    """Build a synthetic repository under *tmp*.

    The repository consists of:

    * the metadata directory returned by ``muse_dir(tmp)`` holding
      ``repo.json``, ``HEAD``, ``config.toml``, the ``commits`` /
      ``snapshots`` / ``objects`` directories and ``refs/heads``;
    * ``_N_FILES`` base blobs forming the base manifest;
    * ``_N_COMMITS`` commits chained through ``parent_commit_id``, spaced
      60 seconds apart starting at 2026-01-01 UTC.  Each commit's manifest
      is a copy of the *base* manifest with ``_FILES_CHANGED`` entries
      pointing at newly written blobs;
    * branch ``main`` pointing at the last commit.

    Returns:
        ``(repo_root, head_commit_id)`` where ``repo_root`` is *tmp*.
    """
    # --- on-disk skeleton ---------------------------------------------------
    tmp.mkdir(parents=True, exist_ok=True)
    meta_dir = muse_dir(tmp)
    meta_dir.mkdir()
    (meta_dir / "repo.json").write_text('{"repo_id":"snap-load-test","owner":"gabriel"}')
    for store_name in ("commits", "snapshots", "objects"):
        (meta_dir / store_name).mkdir()
    (meta_dir / "refs" / "heads").mkdir(parents=True)
    (meta_dir / "HEAD").write_text("ref: refs/heads/main\n")
    (meta_dir / "config.toml").write_text("")

    # --- base blobs and the manifest that references them -------------------
    base_manifest: dict[str, str] = {}
    for file_no in range(_N_FILES):
        payload = f"base-{file_no:06d}".encode() + b"x" * _BLOB_SIZE
        base_oid = blob_id(payload)
        write_object(tmp, base_oid, payload)
        base_manifest[f"src/file_{file_no:04d}.py"] = base_oid

    # --- linear commit chain ------------------------------------------------
    first_commit_time = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
    one_minute = datetime.timedelta(seconds=60)
    previous_cid: str | None = None

    for commit_no in range(_N_COMMITS):
        committed_at = first_commit_time + commit_no * one_minute

        # Every snapshot starts from the base manifest, then swaps a rotating
        # window of entries for blobs unique to this commit.
        manifest = base_manifest.copy()
        window_start = commit_no * _FILES_CHANGED
        for change_no in range(_FILES_CHANGED):
            slot = (window_start + change_no) % _N_FILES
            payload = (
                f"commit-{commit_no:05d}-file-{change_no}".encode()
                + b"y" * _BLOB_SIZE
            )
            new_oid = blob_id(payload)
            write_object(tmp, new_oid, payload)
            manifest[f"src/file_{slot:04d}.py"] = new_oid

        snapshot_id = compute_snapshot_id(manifest)
        write_snapshot(tmp, SnapshotRecord(snapshot_id=snapshot_id, manifest=manifest))

        message = f"commit-{commit_no:05d}"
        parent_ids = [previous_cid] if previous_cid else []
        commit_id = compute_commit_id(
            parent_ids=parent_ids,
            snapshot_id=snapshot_id,
            message=message,
            committed_at_iso=committed_at.isoformat(),
            author="gabriel",
        )
        record = CommitRecord(
            commit_id=commit_id,
            branch="main",
            snapshot_id=snapshot_id,
            message=message,
            committed_at=committed_at,
            parent_commit_id=previous_cid,
            parent2_commit_id=None,
            author="gabriel",
            metadata={},
            structured_delta=None,
            sem_ver_bump="none",
            breaking_changes=[],
            agent_id="",
            model_id="",
            toolchain_id="",
            prompt_hash="",
            signature="",
            signer_key_id="",
        )
        write_commit(tmp, record)
        previous_cid = commit_id

    # With no commits at all the head is the empty string.
    head_cid = previous_cid if previous_cid is not None else ""
    write_branch_ref(tmp, "main", head_cid)
    return tmp, head_cid
137
138
139	# ---------------------------------------------------------------------------
140	# Tests
141	# ---------------------------------------------------------------------------
142
def test_collect_blob_ids_from_deltas_correctness(tmp_path: pathlib.Path) -> None:
    """The delta-derived ID set equals the manifest-scan ID set.

    Correctness invariant: extracting object IDs from the delta list built
    by ``_build_snapshot_deltas`` must yield exactly the objects that
    ``collect_blob_ids`` finds for the same head — no extras, none missing.
    """
    repo_root, head_id = _make_repo(tmp_path / "repo")

    # iter_ancestors output is flipped so the oldest commit comes first.
    history = list(iter_ancestors(repo_root, [head_id]))
    history.reverse()
    snapshot_deltas = _build_snapshot_deltas(repo_root, history)

    via_deltas = set(collect_blob_ids_from_deltas(snapshot_deltas))
    via_manifests = set(collect_blob_ids(repo_root, [head_id]))

    assert via_deltas == via_manifests, (
        f"delta method returned {len(via_deltas)} IDs, "
        f"manifest method returned {len(via_manifests)} IDs\n"
        f" only in deltas: {len(via_deltas - via_manifests)}\n"
        f" only in manifests: {len(via_manifests - via_deltas)}"
    )
163
164
def test_collect_blob_ids_from_deltas_is_fast(tmp_path: pathlib.Path) -> None:
    """collect_blob_ids_from_deltas must run in < 10ms — it is pure in-memory.

    The deltas are already computed by ``_build_snapshot_deltas`` before the
    clock starts, so only the ID extraction itself is timed.

    The gate is applied to the fastest of several runs: a single wall-clock
    sample can be inflated by scheduler or garbage-collector noise, whereas
    the minimum is a stable lower bound on the work the function really does.
    Any per-delta disk read sneaking back in would raise every sample, and
    therefore the minimum, above the threshold.
    """
    repeats = 5

    repo, head = _make_repo(tmp_path / "repo")

    commits_oldest_first = list(reversed(list(iter_ancestors(repo, [head]))))
    deltas = _build_snapshot_deltas(repo, commits_oldest_first)

    samples_ms: list[float] = []
    for _ in range(repeats):
        t0 = time.perf_counter()
        ids = collect_blob_ids_from_deltas(deltas)
        samples_ms.append((time.perf_counter() - t0) * 1000)
    elapsed_ms = min(samples_ms)

    assert elapsed_ms < 10, (
        f"collect_blob_ids_from_deltas took {elapsed_ms:.1f}ms — "
        f"expected < 10ms (pure in-memory, no disk I/O)\n"
        f" {len(ids)} object IDs from {len(deltas)} deltas"
    )

    print(
        f"\n {_N_COMMITS} commits × {_N_FILES} files × {_FILES_CHANGED} changed\n"
        f" delta count: {len(deltas)}\n"
        f" object count: {len(ids)}\n"
        f" elapsed: {elapsed_ms:.2f}ms"
    )
196
197
def test_build_snapshot_deltas_single_pass(tmp_path: pathlib.Path) -> None:
    """_build_snapshot_deltas reads each snapshot exactly once.

    ``muse.core.mpack.read_snapshot`` is temporarily replaced by a wrapper
    that records every requested snapshot ID and then delegates to the real
    reader.  The test then asserts that the number of calls and the number of
    distinct IDs both equal ``_N_COMMITS`` — one read per commit, no repeats.
    """
    from unittest import mock

    import muse.core.mpack as _pack
    from muse.core.snapshots import read_snapshot as original_read_snapshot

    repo, head = _make_repo(tmp_path / "repo")

    calls: list[str] = []

    def counting_read_snapshot(root: pathlib.Path, sid: str) -> "SnapshotRecord | None":
        calls.append(sid)
        return original_read_snapshot(root, sid)

    # The ancestor walk happens outside the patch so that only reads issued
    # by the function under test are counted.
    commits_oldest_first = list(reversed(list(iter_ancestors(repo, [head]))))

    # patch.object restores whatever mpack.read_snapshot was beforehand, even
    # if the call raises, and fails loudly if mpack has no such attribute
    # instead of silently creating one.
    with mock.patch.object(_pack, "read_snapshot", counting_read_snapshot):
        _build_snapshot_deltas(repo, commits_oldest_first)

    unique_reads = len(set(calls))

    assert unique_reads == _N_COMMITS, (
        f"_build_snapshot_deltas read {unique_reads} unique snapshots, "
        f"expected exactly {_N_COMMITS}"
    )
    assert len(calls) == _N_COMMITS, (
        f"_build_snapshot_deltas called read_snapshot {len(calls)} times, "
        f"expected exactly {_N_COMMITS} (one per commit, no duplicates)"
    )

test_push_snapshot_loading.py file-level

`test_push_snapshot_loading.py` file-level