gabriel / muse public
test_stress_object_store.py python
299 lines 10.9 KB
Raw
1 """Stress tests for the content-addressed object store.
2
3 Exercises:
4 - Write-then-read round-trip for varied payload sizes (empty … 1 MB).
5 - Idempotency: writing the same object ID twice is a no-op.
6 - has_object before and after writes.
7 - object_path sharding: first two hex chars as directory.
8 - read_object returns None for absent objects.
9 - restore_object copies bytes faithfully.
10 - write_object_from_path round-trips file content and is idempotent.
11 - Content integrity: read(write(content)) == content.
12 - Multiple distinct objects coexist without collision.
13 """
14
15 import os
16 import pathlib
17 import secrets
18
19 import pytest
20
21 from muse.core.object_store import (
22 has_object,
23 object_path,
24 objects_dir,
25 read_object,
26 restore_object,
27 write_object,
28 write_object_from_path,
29 )
30 from muse.core.paths import muse_dir
31 from muse.core.types import blob_id, long_id, fake_id
32
33
34 # ---------------------------------------------------------------------------
35 # Helpers
36 # ---------------------------------------------------------------------------
37
38
@pytest.fixture
def repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Provide a per-test repository root.

    Creates the directory that ``muse_dir`` reports for pytest's ``tmp_path``
    and returns ``tmp_path`` itself as the repository root.
    """
    store_root = muse_dir(tmp_path)
    store_root.mkdir()
    return tmp_path
43
44
45 # ---------------------------------------------------------------------------
46 # Basic round-trip
47 # ---------------------------------------------------------------------------
48
49
class TestRoundTrip:
    """Write-then-read checks: bytes read back must equal the bytes written."""

    def test_write_then_read_small(self, repo: pathlib.Path) -> None:
        """A short ASCII payload survives a write/read cycle."""
        payload = b"hello muse"
        key = blob_id(payload)
        write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload

    def test_write_then_read_empty(self, repo: pathlib.Path) -> None:
        """A zero-length payload survives a write/read cycle."""
        payload = b""
        key = blob_id(payload)
        write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload

    def test_write_then_read_single_byte(self, repo: pathlib.Path) -> None:
        """A single NUL byte survives a write/read cycle."""
        payload = b"\x00"
        key = blob_id(payload)
        write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload

    def test_write_then_read_binary(self, repo: pathlib.Path) -> None:
        """Every byte value 0-255, repeated 100 times, survives a write/read cycle."""
        all_byte_values = bytes(range(256))
        payload = all_byte_values * 100
        key = blob_id(payload)
        write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload

    @pytest.mark.parametrize("size", [1, 100, 4096, 65536, 1_000_000])
    def test_write_then_read_various_sizes(self, repo: pathlib.Path, size: int) -> None:
        """Random payloads of several sizes: the first write reports True and reads back intact."""
        payload = secrets.token_bytes(size)
        key = blob_id(payload)
        was_written = write_object(repo, key, payload)
        assert was_written is True
        stored = read_object(repo, key)
        assert stored == payload

    def test_content_integrity(self, repo: pathlib.Path) -> None:
        """Read back exactly what was written — neither truncated nor padded."""
        for index in range(20):
            # Each payload has a different length so truncation/padding would show up.
            padding = "x" * index
            payload = f"object-content-{index}-{padding}".encode()
            key = blob_id(payload)
            write_object(repo, key, payload)
            stored = read_object(repo, key)
            assert stored == payload
            assert len(stored) == len(payload)
91
92
93 # ---------------------------------------------------------------------------
94 # Idempotency
95 # ---------------------------------------------------------------------------
96
97
class TestIdempotency:
    """Repeated writes under one object ID must leave the stored bytes unchanged."""

    def test_double_write_returns_false_second_time(self, repo: pathlib.Path) -> None:
        """The first write reports True; an identical second write reports False."""
        payload = b"idempotent"
        key = blob_id(payload)
        first_result = write_object(repo, key, payload)
        assert first_result is True
        second_result = write_object(repo, key, payload)
        assert second_result is False

    def test_double_write_does_not_corrupt(self, repo: pathlib.Path) -> None:
        """Mismatched content under an existing ID is rejected and the original is kept."""
        original = b"original content"
        key = blob_id(original)
        write_object(repo, key, original)
        # Different bytes under the same ID are expected to raise ValueError
        # carrying the integrity-failure message ...
        with pytest.raises(ValueError, match="Content integrity failure"):
            write_object(repo, key, b"different content")
        # ... and the object already in the store must still hold the original bytes.
        stored = read_object(repo, key)
        assert stored == original

    def test_triple_write_stays_stable(self, repo: pathlib.Path) -> None:
        """Three identical writes in a row still read back as the original payload."""
        payload = b"triple-write"
        key = blob_id(payload)
        for _attempt in range(3):
            write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload
121
122
123 # ---------------------------------------------------------------------------
124 # has_object
125 # ---------------------------------------------------------------------------
126
127
class TestHasObject:
    """``has_object`` reflects exactly which object IDs have been written."""

    def test_absent_before_write(self, repo: pathlib.Path) -> None:
        """An ID that was never written is reported as absent."""
        key = blob_id(b"not yet written")
        present = has_object(repo, key)
        assert not present

    def test_present_after_write(self, repo: pathlib.Path) -> None:
        """An ID is reported as present once its object has been written."""
        payload = b"present"
        key = blob_id(payload)
        write_object(repo, key, payload)
        present = has_object(repo, key)
        assert present

    def test_other_objects_dont_shadow(self, repo: pathlib.Path) -> None:
        """Writing one object does not make a different ID appear present."""
        first_payload = b"object-a"
        second_payload = b"object-b"
        first_key = blob_id(first_payload)
        second_key = blob_id(second_payload)

        # Only the first object exists at this point.
        write_object(repo, first_key, first_payload)
        assert has_object(repo, first_key)
        assert not has_object(repo, second_key)

        # After writing the second object it becomes visible as well.
        write_object(repo, second_key, second_payload)
        assert has_object(repo, second_key)
149
150
151 # ---------------------------------------------------------------------------
152 # Absent objects
153 # ---------------------------------------------------------------------------
154
155
class TestAbsentObjects:
    """Lookups for IDs that were never written must fail cleanly."""

    def test_read_absent_returns_none(self, repo: pathlib.Path) -> None:
        """Reading an ID that was never written yields None."""
        missing_key = fake_id("absent-a")
        outcome = read_object(repo, missing_key)
        assert outcome is None

    def test_restore_absent_returns_false(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Restoring an ID that was never written returns False and creates no file."""
        missing_key = fake_id("absent-b")
        target = tmp_path / "restored.bin"
        outcome = restore_object(repo, missing_key, target)
        assert outcome is False
        assert not target.exists()

    def test_has_object_false_for_random_id(self, repo: pathlib.Path) -> None:
        """Ten IDs built from random 64-char hex strings are all reported absent."""
        for _trial in range(10):
            random_hex = secrets.token_hex(32)
            random_key = long_id(random_hex)
            assert not has_object(repo, random_key)
171
172
173 # ---------------------------------------------------------------------------
174 # Sharding layout
175 # ---------------------------------------------------------------------------
176
177
class TestSharding:
    """On-disk layout: the parent directory is named after the first two ID characters."""

    def test_object_path_uses_first_two_chars_as_dir(self, repo: pathlib.Path) -> None:
        """The first two characters name the directory; the remaining 62 name the file."""
        oid = long_id(f"ab{'c' * 62}")
        path = object_path(repo, oid)
        assert path.parent.name == "ab"
        assert path.name == "c" * 62

    def test_objects_with_same_prefix_go_to_same_shard(self, repo: pathlib.Path) -> None:
        """Two IDs sharing their first two characters share a parent directory."""
        oid1 = long_id(f"ff{'0' * 62}")
        oid2 = long_id(f"ff{'1' * 62}")
        assert object_path(repo, oid1).parent == object_path(repo, oid2).parent

    def test_objects_with_different_prefix_go_to_different_shards(self, repo: pathlib.Path) -> None:
        """Two IDs differing in their first two characters get different parent directories."""
        # Use valid 64-char hex IDs with different first-two-char prefixes.
        oid1 = long_id(f"aa{'f' * 62}")
        oid2 = long_id(f"bb{'f' * 62}")
        assert object_path(repo, oid1).parent != object_path(repo, oid2).parent

    def test_256_shards_can_all_be_created(self, repo: pathlib.Path) -> None:
        """Write one object per possible shard prefix (00-ff).

        Finds data whose object ID lands in each 2-hex prefix by brute force,
        using a counter as the seed so the run is deterministic.  The search is
        bounded so a prefix that is never produced fails the test instead of
        hanging it.
        """
        # With uniformly distributed IDs roughly 1.6k seeds are expected to
        # cover all 256 prefixes; this ceiling is far beyond that.
        max_attempts = 100_000
        written_prefixes: set[str] = set()
        for n in range(max_attempts):
            data = f"shard-seed-{n}".encode()
            oid = blob_id(data)
            prefix = object_path(repo, oid).parent.name
            if prefix in written_prefixes:
                continue
            write_object(repo, oid, data)
            written_prefixes.add(prefix)
            if len(written_prefixes) == 256:
                break
        else:
            pytest.fail(
                f"only {len(written_prefixes)} of 256 shard prefixes found "
                f"after {max_attempts} seeds"
            )
        # Verify all 256 shard dirs exist under the sha256/ algo directory,
        # and that they are exactly the shards this test wrote into.
        algo_dir = objects_dir(repo) / "sha256"
        shards = {d.name for d in algo_dir.iterdir() if d.is_dir()}
        assert len(shards) == 256
        assert shards == written_prefixes
217
218
219 # ---------------------------------------------------------------------------
220 # write_object_from_path
221 # ---------------------------------------------------------------------------
222
223
class TestWriteObjectFromPath:
    """Storing an object by pointing the store at a file on disk."""

    def test_from_path_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A file stored via its path reports True and reads back as the file's bytes."""
        payload = b"from-path-content"
        source_file = tmp_path / "source.bin"
        source_file.write_bytes(payload)
        key = blob_id(payload)
        was_written = write_object_from_path(repo, key, source_file)
        assert was_written is True
        stored = read_object(repo, key)
        assert stored == payload

    def test_from_path_idempotent(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Storing the same file under the same ID a second time reports False."""
        payload = b"idempotent-from-path"
        source_file = tmp_path / "idem.bin"
        source_file.write_bytes(payload)
        key = blob_id(payload)
        write_object_from_path(repo, key, source_file)
        second_result = write_object_from_path(repo, key, source_file)
        assert second_result is False
240
241
242 # ---------------------------------------------------------------------------
243 # restore_object
244 # ---------------------------------------------------------------------------
245
246
class TestRestoreObject:
    """Copying a stored object back out to a destination path."""

    def test_restore_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Restoring a written object reports True and the destination holds its bytes."""
        payload = b"restore-me"
        key = blob_id(payload)
        write_object(repo, key, payload)
        target = tmp_path / "sub" / "restored.bin"
        restored = restore_object(repo, key, target)
        assert restored is True
        assert target.read_bytes() == payload

    def test_restore_creates_parent_dirs(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """The destination file exists even when its parent directories did not."""
        payload = b"deep-restore"
        key = blob_id(payload)
        write_object(repo, key, payload)
        # None of a/, a/b/ or a/b/c/ exist before the restore call.
        target = tmp_path / "a" / "b" / "c" / "file.bin"
        restore_object(repo, key, target)
        assert target.exists()

    def test_restore_large_object_intact(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A 2,000,000-byte random payload is restored byte-for-byte."""
        payload = secrets.token_bytes(2_000_000)
        key = blob_id(payload)
        write_object(repo, key, payload)
        target = tmp_path / "large.bin"
        restore_object(repo, key, target)
        assert target.read_bytes() == payload
271
272
273 # ---------------------------------------------------------------------------
274 # Multiple distinct objects
275 # ---------------------------------------------------------------------------
276
277
class TestMultipleObjects:
    """Many distinct objects stored side by side must each stay retrievable."""

    def test_100_distinct_objects_coexist(self, repo: pathlib.Path) -> None:
        """Write 100 different payloads, then read every one back unchanged."""
        # Maps object ID -> payload; the original annotation named an
        # undefined type, so a concrete builtin generic is used instead.
        written: dict[str, bytes] = {}
        for i in range(100):
            # Varying length and content keeps all 100 payloads distinct.
            data = f"payload-{i:03d}-{'z' * i}".encode()
            oid = blob_id(data)
            write_object(repo, oid, data)
            written[oid] = data

        # Read only after all writes so a later write clobbering an earlier
        # object would be detected.
        for oid, data in written.items():
            assert read_object(repo, oid) == data

    def test_all_objects_independently_addressable(self, repo: pathlib.Path) -> None:
        """Verify no two distinct objects collide in the store."""
        written: dict[str, bytes] = {}
        for _ in range(50):
            data = secrets.token_bytes(64)
            oid = blob_id(data)
            write_object(repo, oid, data)
            written[oid] = data
        # All OIDs should be unique (probabilistic but essentially certain);
        # a duplicate ID would collapse two entries into one key.
        assert len(written) == 50
        # Each ID must still resolve to its own payload after all writes.
        for oid, data in written.items():
            assert read_object(repo, oid) == data
File History 1 commit