test_compression.py
python
sha256:248464b6a2f758985cbef90f864fa62c61842be699d975d6e00b6a9509ef919c
fix(delta): detect blob-identical file renames for files wi…
Sonnet 4.6
patch
23 days ago
| 1 | """Unit tests for muse.core.compression — zlib (Tier 1) and delta (Tier 2).""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | import zlib |
| 6 | |
| 7 | import pytest |
| 8 | |
| 9 | from muse.core.compression import ( |
| 10 | _COPY_THRESHOLD, |
| 11 | apply_delta, |
| 12 | compress_zlib, |
| 13 | compute_delta, |
| 14 | decompress_zlib, |
| 15 | ) |
| 16 | |
| 17 | |
| 18 | # --------------------------------------------------------------------------- |
| 19 | # Tier 1 — zlib round-trip |
| 20 | # --------------------------------------------------------------------------- |
| 21 | |
| 22 | |
class TestZlib:
    """Round-trip and sanity checks for the Tier 1 zlib helpers."""

    def test_round_trip_source_code(self) -> None:
        """Python source text survives a compress/decompress cycle unchanged."""
        payload = b"def foo(x):\n return x * 2\n" * 100
        restored = decompress_zlib(compress_zlib(payload))
        assert restored == payload

    def test_round_trip_binary(self) -> None:
        """A payload covering every byte value survives the cycle unchanged."""
        payload = bytes(range(256)) * 128
        restored = decompress_zlib(compress_zlib(payload))
        assert restored == payload

    def test_round_trip_empty(self) -> None:
        """Zero-length input comes back as zero-length output."""
        empty = b""
        assert decompress_zlib(compress_zlib(empty)) == empty

    def test_compress_reduces_size_for_text(self) -> None:
        """Highly repetitive text shrinks when compressed."""
        payload = b"hello world " * 1000
        packed = compress_zlib(payload)
        assert len(packed) < len(payload)

    def test_decompress_corrupt_raises(self) -> None:
        """Feeding non-zlib bytes to decompress_zlib raises zlib.error."""
        garbage = b"not zlib data"
        with pytest.raises(zlib.error):
            decompress_zlib(garbage)

    def test_compress_output_is_valid_zlib(self) -> None:
        """The stdlib decoder can read compress_zlib output directly."""
        payload = b"test payload"
        packed = compress_zlib(payload)
        # Decode with zlib itself (not decompress_zlib) to prove the wire
        # format is plain zlib rather than a project-specific wrapper.
        assert zlib.decompress(packed) == payload
| 53 | |
| 54 | |
| 55 | # --------------------------------------------------------------------------- |
| 56 | # Tier 2 — delta round-trip |
| 57 | # --------------------------------------------------------------------------- |
| 58 | |
| 59 | |
class TestDelta:
    """Tests for the Tier 2 delta codec (``compute_delta`` / ``apply_delta``).

    Round-trip tests encode with ``compute_delta`` and decode with
    ``apply_delta``. Decoder-only tests hand-build an instruction stream,
    compress it with ``zlib.compress`` and pass it straight to ``apply_delta``.

    The hand-built streams in this class use two instruction shapes:

    * COPY: a 0x00 byte followed by ``struct.pack(">II", 0, length)``.
    * DATA: a 0x01 byte followed by ``struct.pack(">I", length)`` and then
      ``length`` literal bytes.
    """

    def test_round_trip_identical_content(self) -> None:
        """Delta of a file against itself reconstructs the original."""
        data = b"unchanged content" * 50
        delta = compute_delta(data, data)
        assert apply_delta(data, delta) == data

    def test_round_trip_small_edit(self) -> None:
        """A one-byte change in a large file round-trips correctly."""
        base = b"the quick brown fox jumps over the lazy dog" * 100
        # Overwrite exactly one byte (index 500); the length is unchanged.
        target = base[:500] + b"X" + base[501:]
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_round_trip_append(self) -> None:
        """Appending bytes to a file round-trips correctly."""
        base = b"existing content\n" * 20
        target = base + b"new line appended\n"
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_round_trip_prepend(self) -> None:
        """Prepending bytes to a file round-trips correctly."""
        base = b"existing content\n" * 20
        target = b"new header\n" + base
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_apply_delta_pure_data_instruction(self) -> None:
        """apply_delta reconstructs the target from a stream holding only DATA.

        compute_delta raises ValueError when DATA-only deltas aren't smaller
        than plain zlib (completely different content has no COPY
        opportunities and DATA framing adds overhead), so the encoder cannot
        be used to produce this input. The decoder is verified directly with
        a hand-built stream instead.
        """
        import struct

        target = b"completely different from base " * 20
        # Single DATA instruction carrying the whole target.
        stream = b"\x01" + struct.pack(">I", len(target)) + target
        delta = zlib.compress(stream, level=1)
        # The base is never referenced by a DATA-only stream.
        base = b"unrelated source material " * 20
        assert apply_delta(base, delta) == target

    def test_round_trip_empty_base(self) -> None:
        """Empty base produces a pure DATA delta that reconstructs target.

        NOTE(review): other docstrings in this class state that compute_delta
        rejects DATA-only deltas as not profitable. Confirm that an empty
        base is handled as a special case; otherwise this call would be
        expected to raise ValueError rather than return a delta.
        """
        base = b""
        target = b"new file content"
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_apply_delta_empty_target(self) -> None:
        """apply_delta with an empty instruction stream produces empty bytes.

        An empty target has no delta instructions (nothing to emit), so the
        compressed stream is just zlib(b"") — the same size as
        compress_zlib(b""). compute_delta raises ValueError for this case
        (not profitable), so the decoder is verified directly: an empty
        instruction stream reconstructs b"".
        """
        delta = zlib.compress(b"", level=1)
        base = b"some existing content"
        assert apply_delta(base, delta) == b""

    def test_round_trip_source_file(self) -> None:
        """Simulated source-file edit: add a function at the end."""
        base = (
            b"def foo():\n pass\n\n"
            b"def bar():\n return 1\n\n"
        ) * 30
        target = base + b"def baz():\n return 2\n"
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_delta_smaller_than_zlib_for_small_edit(self) -> None:
        """Delta is smaller than plain zlib for a small edit in a large file."""
        base = b"stable content\n" * 500
        # Replace 13 bytes (offsets 1000..1012) with a 13-byte line so the
        # overall length stays the same.
        target = base[:1000] + b"changed line\n" + base[1013:]
        delta = compute_delta(base, target)
        plain = compress_zlib(target)
        assert len(delta) < len(plain), (
            f"Expected delta ({len(delta)}) < zlib ({len(plain)})"
        )

    def test_unprofitable_delta_raises_value_error(self) -> None:
        """compute_delta raises ValueError when delta >= zlib(target)."""
        import hashlib

        # Two unrelated, high-entropy 64-byte inputs: there are no copy
        # opportunities and the overhead of the delta format makes the delta
        # larger than plain zlib. Fixed SHA-512 digests are used rather than
        # os.urandom so the inputs are identical on every run and any failure
        # is reproducible.
        base = hashlib.sha512(b"unprofitable-delta-base").digest()
        target = hashlib.sha512(b"unprofitable-delta-target").digest()
        with pytest.raises(ValueError, match="not profitable"):
            compute_delta(base, target)

    def test_apply_delta_corrupt_raises(self) -> None:
        """apply_delta raises zlib.error on corrupt compressed input."""
        with pytest.raises(zlib.error):
            apply_delta(b"base", b"not zlib")

    def test_apply_delta_unknown_instruction_raises(self) -> None:
        """apply_delta raises ValueError on an unknown instruction byte."""
        import struct

        # Valid zlib container, but the first instruction byte (0x99) is
        # neither COPY (0x00) nor DATA (0x01).
        bad_stream = zlib.compress(b"\x99" + struct.pack(">I", 0))
        with pytest.raises(ValueError, match="unknown delta instruction type"):
            apply_delta(b"base", bad_stream)

    def test_copy_threshold_respected(self) -> None:
        """apply_delta decodes a COPY followed by a below-threshold DATA tail.

        The encoder is expected to emit COPY only for runs of at least
        _COPY_THRESHOLD bytes; anything shorter is carried as DATA. This test
        does not call compute_delta. It hand-builds the stream shape that
        rule would produce — a COPY for the long stable prefix, then a DATA
        instruction for a tail of _COPY_THRESHOLD - 1 bytes — and checks that
        the decoder reassembles the original bytes.
        """
        import struct

        # Long stable prefix, encoded below as a single COPY from the base.
        stable = b"the quick brown fox jumps over the lazy dog\n" * 40
        # Tail that is one byte short of the copy threshold.
        short_tail = b"Z" * (_COPY_THRESHOLD - 1)

        base = stable + short_tail
        target = stable + short_tail  # identical to base by construction

        # COPY(0, len(stable)) followed by DATA(short_tail).
        copy_instr = b"\x00" + struct.pack(">II", 0, len(stable))
        data_instr = b"\x01" + struct.pack(">I", len(short_tail)) + short_tail
        stream = copy_instr + data_instr
        delta = zlib.compress(stream, level=1)

        result = apply_delta(base, delta)
        assert result == target

    def test_large_file_delta_round_trip(self) -> None:
        """1 MB file with a small edit round-trips correctly."""
        base = (b"the quick brown fox\n") * 50_000  # 20 bytes * 50,000 = 1,000,000
        change_pos = 250_000
        # Overwrite 7 bytes in place with the 7-byte marker b"changed".
        target = base[:change_pos] + b"changed" + base[change_pos + 7 :]
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_midi_like_binary_round_trip(self) -> None:
        """MIDI-like binary payload round-trips correctly."""
        # Simulate a MIDI file: fixed header + variable event stream.
        header = bytes([0x4D, 0x54, 0x68, 0x64, 0x00, 0x00, 0x00, 0x06])
        base = header + bytes(range(256)) * 40
        # Simulate a one-byte velocity change: bump the byte at index 100,
        # wrapping at 256 so the value stays a valid byte.
        target = bytearray(base)
        target[100] = (target[100] + 1) % 256
        target_bytes = bytes(target)
        delta = compute_delta(base, target_bytes)
        assert apply_delta(base, delta) == target_bytes
File History
1 commit
sha256:248464b6a2f758985cbef90f864fa62c61842be699d975d6e00b6a9509ef919c
fix(delta): detect blob-identical file renames for files wi…
Sonnet 4.6
patch
23 days ago