gabriel / muse public
test_compression.py python
211 lines 8.9 KB
Raw
sha256:248464b6a2f758985cbef90f864fa62c61842be699d975d6e00b6a9509ef919c fix(delta): detect blob-identical file renames for files wi… Sonnet 4.6 patch 23 days ago
1 """Unit tests for muse.core.compression — zlib (Tier 1) and delta (Tier 2)."""
2
from __future__ import annotations

import hashlib
import struct
import zlib

import pytest

from muse.core.compression import (
    _COPY_THRESHOLD,
    apply_delta,
    compress_zlib,
    compute_delta,
    decompress_zlib,
)
16
17
18 # ---------------------------------------------------------------------------
19 # Tier 1 — zlib round-trip
20 # ---------------------------------------------------------------------------
21
22
class TestZlib:
    """Tier 1 checks: compress_zlib / decompress_zlib round-trips and error handling."""

    def test_round_trip_source_code(self) -> None:
        """Repeated Python source survives compress -> decompress unchanged."""
        original = b"def foo(x):\n return x * 2\n" * 100
        packed = compress_zlib(original)
        assert decompress_zlib(packed) == original

    def test_round_trip_binary(self) -> None:
        """A payload containing every byte value survives the round trip."""
        original = bytes(range(256)) * 128
        restored = decompress_zlib(compress_zlib(original))
        assert restored == original

    def test_round_trip_empty(self) -> None:
        """An empty payload round-trips to an empty payload."""
        packed = compress_zlib(b"")
        assert decompress_zlib(packed) == b""

    def test_compress_reduces_size_for_text(self) -> None:
        """Highly repetitive text must come out smaller than it went in."""
        original = b"hello world " * 1000
        packed_size = len(compress_zlib(original))
        assert packed_size < len(original)

    def test_decompress_corrupt_raises(self) -> None:
        """Bytes that are not a zlib stream make decompress_zlib raise zlib.error."""
        garbage = b"not zlib data"
        with pytest.raises(zlib.error):
            decompress_zlib(garbage)

    def test_compress_output_is_valid_zlib(self) -> None:
        """The stdlib's own zlib.decompress can read compress_zlib output."""
        original = b"test payload"
        assert zlib.decompress(compress_zlib(original)) == original
53
54
55 # ---------------------------------------------------------------------------
56 # Tier 2 — delta round-trip
57 # ---------------------------------------------------------------------------
58
59
def _copy_instruction(offset: int, length: int) -> bytes:
    """Encode a COPY instruction as the hand-built streams in these tests do.

    Layout: type byte 0x00 followed by two big-endian unsigned 32-bit
    integers (offset into the base, then length).
    """
    return b"\x00" + struct.pack(">II", offset, length)


def _data_instruction(payload: bytes) -> bytes:
    """Encode a DATA instruction as the hand-built streams in these tests do.

    Layout: type byte 0x01, a big-endian unsigned 32-bit payload length,
    then the literal payload bytes.
    """
    return b"\x01" + struct.pack(">I", len(payload)) + payload


class TestDelta:
    """Tier 2 checks: compute_delta / apply_delta round-trips, profitability, decoder errors."""

    def test_round_trip_identical_content(self) -> None:
        """Delta of a file against itself reconstructs the original."""
        data = b"unchanged content" * 50
        delta = compute_delta(data, data)
        assert apply_delta(data, delta) == data

    def test_round_trip_small_edit(self) -> None:
        """A one-byte change in a large file round-trips correctly."""
        base = b"the quick brown fox jumps over the lazy dog" * 100
        target = base[:500] + b"X" + base[501:]
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_round_trip_append(self) -> None:
        """Appending bytes to a file round-trips correctly."""
        base = b"existing content\n" * 20
        target = base + b"new line appended\n"
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_round_trip_prepend(self) -> None:
        """Prepending bytes to a file round-trips correctly."""
        base = b"existing content\n" * 20
        target = b"new header\n" + base
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_apply_delta_pure_data_instruction(self) -> None:
        """apply_delta reconstructs the target from a stream holding one DATA instruction.

        compute_delta raises ValueError when DATA-only deltas aren't smaller than
        plain zlib (completely different content has no COPY opportunities and DATA
        framing adds overhead), so the decoder is exercised with a hand-built stream.
        """
        target = b"completely different from base " * 20
        delta = zlib.compress(_data_instruction(target), level=1)
        base = b"unrelated source material " * 20
        assert apply_delta(base, delta) == target

    def test_round_trip_empty_base(self) -> None:
        """Empty base produces a pure DATA delta that reconstructs target."""
        # NOTE(review): other docstrings in this class say compute_delta rejects
        # DATA-only deltas as unprofitable; confirm it accepts an empty base,
        # otherwise this call would raise ValueError instead of round-tripping.
        base = b""
        target = b"new file content"
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_apply_delta_empty_target(self) -> None:
        """apply_delta with an empty instruction stream produces empty bytes.

        An empty target has no delta instructions (nothing to emit), so the
        compressed stream is just zlib(b"") — the same size as compress_zlib(b"").
        compute_delta raises ValueError for this case (not profitable), so the
        decoder is checked directly: an empty instruction stream reconstructs b"".
        """
        delta = zlib.compress(b"", level=1)
        base = b"some existing content"
        assert apply_delta(base, delta) == b""

    def test_round_trip_source_file(self) -> None:
        """Simulated source-file edit: add a function at the end."""
        base = (
            b"def foo():\n pass\n\n"
            b"def bar():\n return 1\n\n"
        ) * 30
        target = base + b"def baz():\n return 2\n"
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_delta_smaller_than_zlib_for_small_edit(self) -> None:
        """Delta should be smaller than plain zlib for a small edit in a large file."""
        base = b"stable content\n" * 500
        target = base[:1000] + b"changed line\n" + base[1013:]
        delta = compute_delta(base, target)
        plain = compress_zlib(target)
        assert len(delta) < len(plain), (
            f"Expected delta ({len(delta)}) < zlib ({len(plain)})"
        )

    def test_unprofitable_delta_raises_value_error(self) -> None:
        """compute_delta raises ValueError when delta >= zlib(target)."""
        # Random-looking data has no copy opportunities and the overhead of the
        # delta format makes it larger than plain zlib.  SHA-512 digests supply
        # 64 high-entropy bytes deterministically, so every run sees the same
        # inputs and any failure is reproducible.
        base = hashlib.sha512(b"muse-delta-test-base").digest()
        target = hashlib.sha512(b"muse-delta-test-target").digest()
        with pytest.raises(ValueError, match="not profitable"):
            compute_delta(base, target)

    def test_apply_delta_corrupt_raises(self) -> None:
        """apply_delta raises zlib.error on corrupt compressed input."""
        with pytest.raises(zlib.error):
            apply_delta(b"base", b"not zlib")

    def test_apply_delta_unknown_instruction_raises(self) -> None:
        """apply_delta raises ValueError on an unknown instruction byte."""
        # 0x99 is neither the COPY (0x00) nor the DATA (0x01) type byte used
        # by the hand-built streams elsewhere in this class.
        bad_stream = zlib.compress(b"\x99" + struct.pack(">I", 0))
        with pytest.raises(ValueError, match="unknown delta instruction type"):
            apply_delta(b"base", bad_stream)

    def test_copy_threshold_respected(self) -> None:
        """Decoder handles COPY of a long prefix followed by a sub-threshold DATA tail.

        The intent is that only runs of >= _COPY_THRESHOLD bytes become COPY
        instructions, with shorter runs emitted as DATA.  This test does not
        call compute_delta; it hand-builds that COPY+DATA shape and verifies
        that apply_delta decodes it back to the target.
        """
        # Long stable prefix: the part a real delta would express as COPY.
        stable = b"the quick brown fox jumps over the lazy dog\n" * 40
        # Tail one byte shorter than the threshold, so it is carried as DATA.
        short_tail = b"Z" * (_COPY_THRESHOLD - 1)

        base = stable + short_tail
        target = stable + short_tail  # identical to base: pure round-trip check

        # COPY(0, len(stable)) pulls the prefix from the base; DATA carries the tail.
        stream = _copy_instruction(0, len(stable)) + _data_instruction(short_tail)
        delta = zlib.compress(stream, level=1)

        result = apply_delta(base, delta)
        assert result == target

    def test_large_file_delta_round_trip(self) -> None:
        """1 MB file with a small edit round-trips correctly."""
        base = b"the quick brown fox\n" * 50_000  # 20 bytes * 50_000 = 1_000_000 bytes
        change_pos = 250_000
        # Overwrite 7 bytes in place so the overall length is unchanged.
        target = base[:change_pos] + b"changed" + base[change_pos + 7:]
        delta = compute_delta(base, target)
        assert apply_delta(base, delta) == target

    def test_midi_like_binary_round_trip(self) -> None:
        """MIDI-like binary payload round-trips correctly."""
        # Simulate a MIDI file: fixed header + variable event stream.
        header = bytes([0x4D, 0x54, 0x68, 0x64, 0x00, 0x00, 0x00, 0x06])
        base = header + bytes(range(256)) * 40
        # Simulate a one-byte velocity change somewhere in the middle.
        target = bytearray(base)
        target[100] = (target[100] + 1) % 256
        target_bytes = bytes(target)
        delta = compute_delta(base, target_bytes)
        assert apply_delta(base, delta) == target_bytes
File History 1 commit
sha256:248464b6a2f758985cbef90f864fa62c61842be699d975d6e00b6a9509ef919c fix(delta): detect blob-identical file renames for files wi… Sonnet 4.6 patch 23 days ago