gabriel / muse public
test_bundle_object_compression.py python
204 lines 6.9 KB
Raw
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 23 days ago
1 """TDD — bundle objects must be zstd-compressed on the bundle push path.
2
3 Git compresses pack objects in C (libxdiff/zlib). Our custom Python delta loop
4 was O(n) per blob in pure Python — 18s for 5162 real-world objects.
5
6 The fix: use zstandard (C extension) for per-object compression. No custom
7 delta loop. One C call per object.
8
9 Three invariants:
10
11 1. Wire size with compression < 40% of raw payload.
12 zstd on Python-like text achieves ~4-6x ratio easily.
13
14 2. Every compressed object reconstructs to bytes matching its object_id.
15 sha256(zstd.decompress(content)) == object_id. This is the correctness
16 proof — content-addressing is the verification.
17
18 3. build_mpack_from_walk(compress=True) completes in < 1s for
19 200 commits × 100 files × 5 changed/commit.
20 This is the speed gate — C extension, not Python loop.
21
22 Repo dimensions: 200 commits × 100 files × 5 changed/commit, BLOB_SIZE=1024.
23 """
24 from __future__ import annotations
25
26 import datetime
27 import pathlib
28 import time
29
30 import pytest
31 import zstandard
32
33 from muse.core.object_store import write_object
34 from muse.core.mpack import build_mpack_from_walk, walk_commits
35 from muse.core.paths import muse_dir
36 from muse.core.ids import hash_commit, hash_snapshot
37 from muse.core.refs import write_branch_ref
38 from muse.core.commits import (
39 CommitRecord,
40 write_commit,
41 )
42 from muse.core.snapshots import (
43 SnapshotRecord,
44 write_snapshot,
45 )
46 from muse.core.types import blob_id
47
48
49 # ---------------------------------------------------------------------------
50 # Dimensions
51 # ---------------------------------------------------------------------------
52
# Number of baseline files (one blob each) that _make_repo stores and lists
# in every snapshot manifest.
_N_FILES = 100
# Number of commits _make_repo writes on the "main" branch.
_N_COMMITS = 200
# Number of manifest entries each commit replaces with a fresh blob.
_FILES_CHANGED = 5
# Nominal blob size. _make_repo repeats a short "x = <n>" line
# _BLOB_SIZE // 8 times, so the real blob length is approximate rather than
# exactly this many bytes.
_BLOB_SIZE = 1024
57
58
59 # ---------------------------------------------------------------------------
60 # Repo builder
61 # ---------------------------------------------------------------------------
62
def _make_repo(tmp: pathlib.Path) -> tuple[pathlib.Path, str]:
    """Build a synthetic repository under *tmp* and return ``(tmp, tip_commit_id)``.

    The on-disk skeleton is created inside ``muse_dir(tmp)``. ``_N_FILES``
    baseline blobs are stored first, then ``_N_COMMITS`` commits are written
    on ``main``, each one pointing at the previous commit as its parent and
    replacing ``_FILES_CHANGED`` manifest entries (picked round-robin) with a
    blob that carries a per-commit trailer line. Commit timestamps start at
    2026-01-01 UTC and advance by 60 seconds per commit.
    """

    def store(text: str) -> str:
        # Encode, content-address, and persist one blob; return its id.
        payload = text.encode()
        object_id = blob_id(payload)
        write_object(tmp, object_id, payload)
        return object_id

    # --- repository skeleton -------------------------------------------
    tmp.mkdir(parents=True, exist_ok=True)
    root = muse_dir(tmp)
    root.mkdir()
    (root / "repo.json").write_text('{"repo_id":"compress-test","owner":"gabriel"}')
    for sub in ("commits", "snapshots", "objects"):
        (root / sub).mkdir()
    (root / "refs" / "heads").mkdir(parents=True)
    (root / "HEAD").write_text("ref: refs/heads/main\n")
    (root / "config.toml").write_text("")

    # --- baseline blobs and manifest -----------------------------------
    filler_lines = _BLOB_SIZE // 8
    base_manifest: dict[str, str] = {}
    for n in range(_N_FILES):
        base_manifest[f"src/file_{n:04d}.py"] = store(
            f"# file {n:04d}\n" + f"x = {n}\n" * filler_lines
        )

    # --- linear commit history -----------------------------------------
    epoch = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
    previous: str | None = None  # id of the most recently written commit

    for seq in range(_N_COMMITS):
        committed_at = epoch + datetime.timedelta(seconds=60 * seq)

        # Start from the baseline and overwrite this commit's touched files.
        manifest = dict(base_manifest)
        for offset in range(_FILES_CHANGED):
            idx = (seq * _FILES_CHANGED + offset) % _N_FILES
            manifest[f"src/file_{idx:04d}.py"] = store(
                f"# file {idx:04d}\n"
                + f"x = {idx}\n" * filler_lines
                + f"# commit {seq:05d}\n"
            )

        sid = hash_snapshot(manifest)
        write_snapshot(tmp, SnapshotRecord(snapshot_id=sid, manifest=manifest))

        parent_ids: list[str] = []
        if previous:
            parent_ids.append(previous)

        msg = f"commit-{seq:05d}"
        cid = hash_commit(
            parent_ids=parent_ids,
            snapshot_id=sid,
            message=msg,
            committed_at_iso=committed_at.isoformat(),
            author="gabriel",
        )
        record = CommitRecord(
            commit_id=cid,
            branch="main",
            snapshot_id=sid,
            message=msg,
            committed_at=committed_at,
            parent_commit_id=previous,
            parent2_commit_id=None,
            author="gabriel",
            metadata={},
            structured_delta=None,
            sem_ver_bump="none",
            breaking_changes=[],
            agent_id="",
            model_id="",
            toolchain_id="",
            prompt_hash="",
            signature="",
            signer_key_id="",
        )
        write_commit(tmp, record)
        previous = cid

    # With zero commits the branch ref is written with an empty id.
    tip = previous or ""
    write_branch_ref(tmp, "main", tip)
    return tmp, tip
135
136
137 # ---------------------------------------------------------------------------
138 # Tests
139 # ---------------------------------------------------------------------------
140
def test_bundle_compression_reduces_wire_size(tmp_path: pathlib.Path) -> None:
    """Compressed blob payload must come in under 40% of the raw payload.

    Builds the same walk twice (without and with ``compress=True``) and
    compares the summed ``content`` lengths of the resulting blobs.
    """

    def payload_size(bundle) -> int:
        # Blobs with missing or empty content contribute zero bytes.
        total = 0
        for blob in bundle["blobs"]:
            total += len(blob.get("content") or b"")
        return total

    repo, head = _make_repo(tmp_path / "repo")
    walk = walk_commits(repo, [head])

    uncompressed = build_mpack_from_walk(repo, walk)
    compressed = build_mpack_from_walk(repo, walk, compress=True)

    raw_bytes = payload_size(uncompressed)
    comp_bytes = payload_size(compressed)

    # An empty raw payload is treated as "no reduction" so the gate fails.
    if raw_bytes:
        ratio = comp_bytes / raw_bytes
    else:
        ratio = 1.0

    blob_count = len(compressed["blobs"])
    assert ratio < 0.40, (
        f"compressed payload is {ratio:.1%} of raw — expected < 40%\n"
        f" raw: {raw_bytes:,} bytes\n"
        f" compressed: {comp_bytes:,} bytes\n"
        f" blobs: {blob_count}"
    )
159
160
def test_bundle_compressed_objects_reconstructable(tmp_path: pathlib.Path) -> None:
    """sha256(zstd.decompress(content)) == object_id for every compressed object.

    Content-addressing is the proof — no external verification needed.

    The test also refuses to pass vacuously: the bundle must contain at least
    one blob, and at least one blob must actually be zstd-encoded. Otherwise
    the decompression path would never be exercised.
    """
    repo, head = _make_repo(tmp_path / "repo")
    walk = walk_commits(repo, [head])
    bundle = build_mpack_from_walk(repo, walk, compress=True)

    blobs = bundle["blobs"]
    assert len(blobs) > 0, "bundle contains no blobs — nothing was verified"

    dctx = zstandard.ZstdDecompressor()
    n_zstd = 0
    for obj in blobs:
        # A blob without an explicit encoding is treated as raw bytes.
        enc = obj.get("encoding", "raw")
        content: bytes = obj.get("content") or b""
        oid: str = obj["object_id"]

        if enc == "raw":
            assert blob_id(content) == oid, f"raw object {oid[:16]}… sha256 mismatch"
        elif enc == "zstd":
            n_zstd += 1
            raw = dctx.decompress(content)
            assert blob_id(raw) == oid, f"zstd object {oid[:16]}… sha256 mismatch"
        else:
            pytest.fail(f"unexpected encoding {enc!r} on object {oid[:16]}…")

    assert n_zstd > 0, (
        f"compress=True produced no zstd-encoded blobs out of {len(blobs)} — "
        "the decompression path was never exercised"
    )
183
184
def test_bundle_compression_is_fast(tmp_path: pathlib.Path) -> None:
    """Speed gate: build_mpack_from_walk(compress=True) must finish in < 1s.

    Only the bundle build is timed; repository construction and the commit
    walk happen before the clock starts.
    """
    repo, head = _make_repo(tmp_path / "repo")
    walk = walk_commits(repo, [head])

    started = time.perf_counter()
    bundle = build_mpack_from_walk(repo, walk, compress=True)
    finished = time.perf_counter()

    elapsed = finished - started
    n_blobs = len(bundle["blobs"])

    failure_message = (
        f"build_mpack_from_walk(compress=True) took {elapsed:.2f}s — expected < 1s\n"
        f" {_N_COMMITS} commits × {_N_FILES} files × {_FILES_CHANGED} changed\n"
        f" {n_blobs} blobs"
    )
    assert elapsed < 1.0, failure_message

    # Visible with `pytest -s`; reports the measured time in milliseconds.
    print(f"\n {n_blobs} blobs compressed in {elapsed*1000:.0f}ms")
File History 1 commit
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 23 days ago