"""TDD — MPack snapshot delta format. Guiding principle: content-addressing is a proof, not a label. snapshot_id = sha256(sorted path-NUL-oid pairs) If we hold snapshot_id and a delta from the parent manifest, we reconstruct the full manifest and hash it. If the hash matches snapshot_id, the delta is correct. No external store needed. The math IS the verification. Tests: 1. build_mpack emits SnapshotDeltaDict entries (delta_upsert/delta_remove), never a full manifest blob per snapshot after the first one. 2. Delta chain reconstruction: apply each delta → hash matches snapshot_id. 3. MPack wire size is < 10% of the equivalent full-manifest mpack for a 100-commit chain where each commit changes one file. 4. apply_mpack round-trips delta bundles: snapshots written to local store have the correct full manifest. """ from __future__ import annotations import datetime import hashlib import pathlib import pytest from muse.core.object_store import write_object from muse.core.mpack import MPack, apply_mpack, build_mpack from muse.core.paths import muse_dir from muse.core.ids import hash_snapshot as compute_snapshot_id from muse.core.refs import write_branch_ref from muse.core.commits import ( CommitRecord, write_commit, ) from muse.core.snapshots import ( SnapshotRecord, read_snapshot, write_snapshot, ) from muse.core.types import blob_id _Manifest = dict[str, str] # snapshot manifest: path → blob_id _ManifestMap = dict[str, _Manifest] # snapshot_id → full manifest # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _make_repo(tmp: pathlib.Path) -> pathlib.Path: tmp.mkdir(parents=True, exist_ok=True) dot = muse_dir(tmp) dot.mkdir() (dot / "repo.json").write_text('{"repo_id":"delta-test","owner":"gabriel"}') for d in ("commits", "snapshots", "objects"): (dot / d).mkdir() (dot / "refs" / "heads").mkdir(parents=True) (dot / "HEAD").write_text("ref: refs/heads/main\n") (dot / "config.toml").write_text("") return tmp _N_BASE_FILES = 50 _N_COMMITS = 100 _BLOB_SIZE = 256 def _make_blob(tag: str) -> tuple[str, bytes]: raw = tag.encode() + b"x" * _BLOB_SIZE return blob_id(raw), raw def _populate_chain(repo: pathlib.Path) -> tuple[str, list[str]]: """Create _N_BASE_FILES blobs + _N_COMMITS commits, each changing one file. Returns (head_commit_id, ordered_snapshot_ids_oldest_first). """ base_blobs: dict[str, tuple[str, bytes]] = {} for i in range(_N_BASE_FILES): oid, raw = _make_blob(f"base-{i:04d}") write_object(repo, oid, raw) base_blobs[f"file_{i:04d}.txt"] = (oid, raw) base_manifest = {path: oid for path, (oid, _) in base_blobs.items()} parent: str | None = None tip = "" ts = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc) snapshot_ids: list[str] = [] for i in range(_N_COMMITS): # Each commit changes exactly one file. new_oid, new_raw = _make_blob(f"commit-{i:05d}-variant") write_object(repo, new_oid, new_raw) manifest = dict(base_manifest) manifest[f"file_{i % _N_BASE_FILES:04d}.txt"] = new_oid sid = compute_snapshot_id(manifest) write_snapshot(repo, SnapshotRecord(snapshot_id=sid, manifest=manifest)) snapshot_ids.append(sid) cid = _make_commit_id(parent, sid, f"c{i:05d}", ts.isoformat()) rec = CommitRecord( commit_id=cid, branch="main", snapshot_id=sid, message=f"c{i:05d}", committed_at=ts, parent_commit_id=parent, parent2_commit_id=None, author="gabriel", metadata={}, structured_delta=None, sem_ver_bump="none", breaking_changes=[], agent_id="", model_id="", toolchain_id="", prompt_hash="", signature="", signer_key_id="", ) write_commit(repo, rec) parent = cid tip = cid ts += datetime.timedelta(seconds=60) write_branch_ref(repo, "main", tip) return tip, snapshot_ids def _make_commit_id(parent: str | None, sid: str, msg: str, ts: str) -> str: from muse.core.ids import hash_commit as compute_commit_id return compute_commit_id( parent_ids=[parent] if parent else [], snapshot_id=sid, message=msg, committed_at_iso=ts, author="gabriel", ) def _reconstruct_from_deltas(mpack: MPack) -> _ManifestMap: """Apply the delta chain and return {snapshot_id: full_manifest}.""" from muse.core.ids import hash_snapshot as csi resolved: _ManifestMap = {} for snap in mpack.get("snapshots") or []: sid = snap["snapshot_id"] parent_sid = snap.get("parent_snapshot_id") delta_upsert: dict[str, str] = snap.get("delta_upsert") or {} delta_remove: list[str] = snap.get("delta_remove") or [] base = dict(resolved[parent_sid]) if parent_sid and parent_sid in resolved else {} base.update(delta_upsert) for path in delta_remove: base.pop(path, None) # The hash IS the proof. assert csi(base) == sid, f"hash mismatch for {sid[:16]}" resolved[sid] = base return resolved # --------------------------------------------------------------------------- # Tests # --------------------------------------------------------------------------- def test_bundle_snapshots_are_deltas(tmp_path: pathlib.Path) -> None: """build_mpack emits snapshot deltas, not full manifests.""" repo = _make_repo(tmp_path / "repo") head, _ = _populate_chain(repo) mpack = build_mpack(repo, [head], have=[]) snaps = mpack.get("snapshots") or [] assert len(snaps) == _N_COMMITS, f"expected {_N_COMMITS} snapshots, got {len(snaps)}" for snap in snaps: assert "delta_upsert" in snap, f"missing delta_upsert in snapshot {snap.get('snapshot_id', '?')[:16]}" assert "delta_remove" in snap, f"missing delta_remove" assert "manifest" not in snap, "full manifest must not be present — delta format only" def test_delta_reconstruction_proves_snapshot_id(tmp_path: pathlib.Path) -> None: """Applying each delta and hashing the result must equal snapshot_id.""" repo = _make_repo(tmp_path / "repo") head, snapshot_ids = _populate_chain(repo) mpack = build_mpack(repo, [head], have=[]) # Will assert inside _reconstruct_from_deltas if any hash mismatches. resolved = _reconstruct_from_deltas(mpack) assert set(resolved.keys()) == set(snapshot_ids), "not all snapshots resolved" def test_only_first_snapshot_has_full_manifest(tmp_path: pathlib.Path) -> None: """All snapshots after the first should have delta_upsert < full manifest size.""" repo = _make_repo(tmp_path / "repo") head, _ = _populate_chain(repo) mpack = build_mpack(repo, [head], have=[]) snaps = mpack.get("snapshots") or [] # First snapshot: delta_upsert == full manifest (no parent), so len == N_BASE_FILES. assert len(snaps[0].get("delta_upsert", {})) == _N_BASE_FILES # All subsequent snapshots change exactly one file → delta_upsert has 1 or 2 entries # (1 add + maybe 1 implicit change if same path reverted). for snap in snaps[1:]: n_add = len(snap.get("delta_upsert", {})) assert n_add < _N_BASE_FILES, ( f"snapshot {snap['snapshot_id'][:16]} delta_upsert has {n_add} entries — " f"should be a small delta, not a full manifest copy" ) def test_delta_bundle_smaller_than_full_manifest(tmp_path: pathlib.Path) -> None: """Delta mpack wire bytes must be < 10% of a hypothetical full-manifest mpack.""" import msgpack repo = _make_repo(tmp_path / "repo") head, snapshot_ids = _populate_chain(repo) delta_bundle = build_mpack(repo, [head], have=[]) delta_bytes = len(msgpack.packb(delta_bundle, use_bin_type=True)) # Build a synthetic "full manifest" mpack for size comparison. full_snap_size = sum( len(msgpack.packb({ "snapshot_id": sid, "manifest": (read_snapshot(repo, sid) or SnapshotRecord(snapshot_id=sid, manifest={})).manifest, }, use_bin_type=True)) for sid in snapshot_ids ) delta_snap_size = sum( len(msgpack.packb(snap, use_bin_type=True)) for snap in (delta_bundle.get("snapshots") or []) ) ratio = delta_snap_size / full_snap_size assert ratio < 0.10, ( f"Delta snapshots are {ratio:.1%} of full-manifest size — expected < 10%.\n" f" delta_snap_bytes={delta_snap_size} full_snap_bytes={full_snap_size}" ) _ = delta_bytes # measured; useful for manual inspection def test_apply_mpack_reconstructs_snapshots_from_deltas(tmp_path: pathlib.Path) -> None: """apply_mpack writes correct full SnapshotRecords from delta bundles.""" src = _make_repo(tmp_path / "src") head, snapshot_ids = _populate_chain(src) mpack = build_mpack(src, [head], have=[]) dst = _make_repo(tmp_path / "dst") result = apply_mpack(dst, mpack) assert result["snapshots_written"] == _N_COMMITS # Every snapshot in dst must have the full correct manifest. for sid in snapshot_ids: snap = read_snapshot(dst, sid) assert snap is not None, f"snapshot {sid[:16]} not written to dst" assert compute_snapshot_id(snap.manifest) == sid, ( f"manifest hash mismatch for {sid[:16]}: " f"compute_snapshot_id gives {compute_snapshot_id(snap.manifest)[:16]}" )