test_stress_object_store.py
python
sha256:d11a87833d5fad6059b7662844bf5448a8911a17cce7a51811f71ad394f248eb
bump to v0.2.0rc13
Human
patch
6 days ago
| 1 | """Stress tests for the content-addressed object store. |
| 2 | |
| 3 | Exercises: |
| 4 | - Write-then-read round-trip for varied payload sizes (0 bytes … 1 MB). |
| 5 | - Idempotency: writing the same object ID twice is a no-op. |
| 6 | - has_object before and after writes. |
| 7 | - object_path sharding: first two hex chars as directory. |
| 8 | - read_object returns None for absent objects. |
| 9 | - restore_object copies bytes faithfully. |
| 10 | - write_object_from_path uses copy semantics, not load. |
| 11 | - Content integrity: read(write(content)) == content. |
| 12 | - Multiple distinct objects coexist without collision. |
| 13 | """ |
| 14 | |
| 15 | import os |
| 16 | import pathlib |
| 17 | import secrets |
| 18 | |
| 19 | import pytest |
| 20 | |
| 21 | from muse.core.object_store import ( |
| 22 | has_object, |
| 23 | object_path, |
| 24 | objects_dir, |
| 25 | read_object, |
| 26 | restore_object, |
| 27 | write_object, |
| 28 | write_object_from_path, |
| 29 | ) |
| 30 | from muse.core.paths import muse_dir |
| 31 | from muse.core.types import blob_id, long_id, fake_id |
| 32 | |
| 33 | |
| 34 | # --------------------------------------------------------------------------- |
| 35 | # Helpers |
| 36 | # --------------------------------------------------------------------------- |
| 37 | |
| 38 | |
@pytest.fixture
def repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Provide a scratch repository root for one test.

    Creates the directory that ``muse_dir`` resolves to for ``tmp_path`` and
    returns ``tmp_path`` itself, which the tests then pass as the repo root.
    """
    metadata_dir = muse_dir(tmp_path)
    # Plain mkdir(): no ``parents``/``exist_ok`` — a pre-existing directory
    # would raise rather than be silently reused.
    metadata_dir.mkdir()
    return tmp_path
| 43 | |
| 44 | |
| 45 | # --------------------------------------------------------------------------- |
| 46 | # Basic round-trip |
| 47 | # --------------------------------------------------------------------------- |
| 48 | |
| 49 | |
class TestRoundTrip:
    """Write an object, read it back, and expect the identical bytes."""

    def test_write_then_read_small(self, repo: pathlib.Path) -> None:
        """A short ASCII payload survives a write/read cycle."""
        payload = b"hello muse"
        key = blob_id(payload)
        write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload

    def test_write_then_read_empty(self, repo: pathlib.Path) -> None:
        """The zero-length payload is a storable object too."""
        payload = b""
        key = blob_id(payload)
        write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload

    def test_write_then_read_single_byte(self, repo: pathlib.Path) -> None:
        """A lone NUL byte comes back unchanged."""
        payload = b"\x00"
        key = blob_id(payload)
        write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload

    def test_write_then_read_binary(self, repo: pathlib.Path) -> None:
        """Every byte value 0x00-0xff, repeated 100 times, round-trips."""
        every_byte_value = bytes(range(256))
        payload = every_byte_value * 100
        key = blob_id(payload)
        write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload

    @pytest.mark.parametrize("size", [1, 100, 4096, 65536, 1_000_000])
    def test_write_then_read_various_sizes(self, repo: pathlib.Path, size: int) -> None:
        """Random payloads of several sizes are newly written and read back."""
        payload = secrets.token_bytes(size)
        key = blob_id(payload)
        newly_written = write_object(repo, key, payload)
        # A first write of fresh content is expected to report True.
        assert newly_written is True
        stored = read_object(repo, key)
        assert stored == payload

    def test_content_integrity(self, repo: pathlib.Path) -> None:
        """Read back exactly what was written — not a truncated or padded version."""
        payloads = [f"object-content-{i}-{'x' * i}".encode() for i in range(20)]
        for payload in payloads:
            key = blob_id(payload)
            write_object(repo, key, payload)
            recovered = read_object(repo, key)
            assert recovered == payload
            assert len(recovered) == len(payload)
| 91 | |
| 92 | |
| 93 | # --------------------------------------------------------------------------- |
| 94 | # Idempotency |
| 95 | # --------------------------------------------------------------------------- |
| 96 | |
| 97 | |
class TestIdempotency:
    """Repeated writes under one object ID must leave the stored bytes alone."""

    def test_double_write_returns_false_second_time(self, repo: pathlib.Path) -> None:
        """The first write reports True; an identical second write reports False."""
        payload = b"idempotent"
        key = blob_id(payload)
        first_result = write_object(repo, key, payload)
        assert first_result is True
        second_result = write_object(repo, key, payload)
        assert second_result is False

    def test_double_write_does_not_corrupt(self, repo: pathlib.Path) -> None:
        """Mismatched content under an existing ID is rejected, original kept."""
        original = b"original content"
        key = blob_id(original)
        write_object(repo, key, original)
        # This test expects a ValueError whose message matches
        # "Content integrity failure" when the bytes do not belong to the ID,
        # and expects the object already on disk to stay untouched.
        with pytest.raises(ValueError, match="Content integrity failure"):
            write_object(repo, key, b"different content")
        stored = read_object(repo, key)
        assert stored == original

    def test_triple_write_stays_stable(self, repo: pathlib.Path) -> None:
        """Three identical writes in a row still read back the same bytes."""
        payload = b"triple-write"
        key = blob_id(payload)
        for _attempt in range(3):
            write_object(repo, key, payload)
        stored = read_object(repo, key)
        assert stored == payload
| 121 | |
| 122 | |
| 123 | # --------------------------------------------------------------------------- |
| 124 | # has_object |
| 125 | # --------------------------------------------------------------------------- |
| 126 | |
| 127 | |
class TestHasObject:
    """``has_object`` reflects exactly which IDs have been written."""

    def test_absent_before_write(self, repo: pathlib.Path) -> None:
        """An ID that was computed but never written is reported absent."""
        key = blob_id(b"not yet written")
        assert not has_object(repo, key)

    def test_present_after_write(self, repo: pathlib.Path) -> None:
        """Once written, the ID is reported present."""
        payload = b"present"
        key = blob_id(payload)
        write_object(repo, key, payload)
        assert has_object(repo, key)

    def test_other_objects_dont_shadow(self, repo: pathlib.Path) -> None:
        """Writing one object does not make an unrelated ID appear present."""
        first_payload = b"object-a"
        second_payload = b"object-b"
        first_key = blob_id(first_payload)
        second_key = blob_id(second_payload)

        # Only the first object exists at this point.
        write_object(repo, first_key, first_payload)
        assert has_object(repo, first_key)
        assert not has_object(repo, second_key)

        # After the second write, the second ID must show up as well.
        write_object(repo, second_key, second_payload)
        assert has_object(repo, second_key)
| 149 | |
| 150 | |
| 151 | # --------------------------------------------------------------------------- |
| 152 | # Absent objects |
| 153 | # --------------------------------------------------------------------------- |
| 154 | |
| 155 | |
class TestAbsentObjects:
    """Lookups for IDs that were never written fail softly."""

    def test_read_absent_returns_none(self, repo: pathlib.Path) -> None:
        """Reading an unknown ID yields ``None``."""
        missing = fake_id("absent-a")
        outcome = read_object(repo, missing)
        assert outcome is None

    def test_restore_absent_returns_false(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Restoring an unknown ID returns False and creates no file."""
        missing = fake_id("absent-b")
        target = tmp_path / "restored.bin"
        outcome = restore_object(repo, missing, target)
        assert outcome is False
        # The destination must not have been created as a side effect.
        assert not target.exists()

    def test_has_object_false_for_random_id(self, repo: pathlib.Path) -> None:
        """Ten random 64-hex-character IDs are all reported absent."""
        for _trial in range(10):
            candidate = long_id(secrets.token_hex(32))
            assert not has_object(repo, candidate)
| 171 | |
| 172 | |
| 173 | # --------------------------------------------------------------------------- |
| 174 | # Sharding layout |
| 175 | # --------------------------------------------------------------------------- |
| 176 | |
| 177 | |
class TestSharding:
    """On-disk layout: the first two hex characters select the shard directory."""

    def test_object_path_uses_first_two_chars_as_dir(self, repo: pathlib.Path) -> None:
        """The directory is the 2-char prefix; the file name is the remaining 62."""
        remainder = "c" * 62
        key = long_id(f"ab{remainder}")
        location = object_path(repo, key)
        assert location.parent.name == "ab"
        assert location.name == "c" * 62

    def test_objects_with_same_prefix_go_to_same_shard(self, repo: pathlib.Path) -> None:
        """Two IDs that share a prefix resolve to one parent directory."""
        first_key = long_id(f"ff{'0' * 62}")
        second_key = long_id(f"ff{'1' * 62}")
        first_shard = object_path(repo, first_key).parent
        second_shard = object_path(repo, second_key).parent
        assert first_shard == second_shard

    def test_objects_with_different_prefix_go_to_different_shards(self, repo: pathlib.Path) -> None:
        """IDs that differ in their first two characters land in distinct directories."""
        # Both IDs are 64 hex characters; only the leading pair differs.
        first_key = long_id(f"aa{'f' * 62}")
        second_key = long_id(f"bb{'f' * 62}")
        first_shard = object_path(repo, first_key).parent
        second_shard = object_path(repo, second_key).parent
        assert first_shard != second_shard

    def test_256_shards_can_all_be_created(self, repo: pathlib.Path) -> None:
        """Write one object per possible shard prefix (00-ff).

        Payloads come from a monotonically increasing counter, so the search
        for an ID landing in each not-yet-seen shard is deterministic.
        """
        import itertools

        seen_prefixes: set[str] = set()
        seeds = itertools.count()
        while len(seen_prefixes) != 256:
            seed = next(seeds)
            payload = f"shard-seed-{seed}".encode()
            key = blob_id(payload)
            shard_name = object_path(repo, key).parent.name
            if shard_name in seen_prefixes:
                # This shard already holds an object; try the next seed.
                continue
            write_object(repo, key, payload)
            seen_prefixes.add(shard_name)

        # Count the shard directories under the sha256/ algo directory.
        algo_root = objects_dir(repo) / "sha256"
        shard_dirs = [entry.name for entry in algo_root.iterdir() if entry.is_dir()]
        assert len(shard_dirs) == 256
| 217 | |
| 218 | |
| 219 | # --------------------------------------------------------------------------- |
| 220 | # write_object_from_path |
| 221 | # --------------------------------------------------------------------------- |
| 222 | |
| 223 | |
class TestWriteObjectFromPath:
    """Storing an object whose bytes come from a file on disk."""

    def test_from_path_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Content ingested from a file reads back identically."""
        payload = b"from-path-content"
        source_file = tmp_path / "source.bin"
        source_file.write_bytes(payload)
        key = blob_id(payload)
        newly_written = write_object_from_path(repo, key, source_file)
        assert newly_written is True
        stored = read_object(repo, key)
        assert stored == payload

    def test_from_path_idempotent(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Ingesting the same file twice reports False the second time."""
        payload = b"idempotent-from-path"
        source_file = tmp_path / "idem.bin"
        source_file.write_bytes(payload)
        key = blob_id(payload)
        write_object_from_path(repo, key, source_file)
        repeat_result = write_object_from_path(repo, key, source_file)
        assert repeat_result is False
| 240 | |
| 241 | |
| 242 | # --------------------------------------------------------------------------- |
| 243 | # restore_object |
| 244 | # --------------------------------------------------------------------------- |
| 245 | |
| 246 | |
class TestRestoreObject:
    """Materialising stored objects back onto the filesystem."""

    def test_restore_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A restored file holds exactly the bytes that were stored."""
        payload = b"restore-me"
        key = blob_id(payload)
        write_object(repo, key, payload)
        target = tmp_path / "sub" / "restored.bin"
        restored = restore_object(repo, key, target)
        assert restored is True
        assert target.read_bytes() == payload

    def test_restore_creates_parent_dirs(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Restoring into a not-yet-existing nested directory still produces the file."""
        payload = b"deep-restore"
        key = blob_id(payload)
        write_object(repo, key, payload)
        # None of a/, a/b/, a/b/c/ exist before the restore call.
        target = tmp_path / "a" / "b" / "c" / "file.bin"
        restore_object(repo, key, target)
        assert target.exists()

    def test_restore_large_object_intact(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A 2,000,000-byte random payload is restored byte-for-byte."""
        payload = secrets.token_bytes(2_000_000)
        key = blob_id(payload)
        write_object(repo, key, payload)
        target = tmp_path / "large.bin"
        restore_object(repo, key, target)
        assert target.read_bytes() == payload
| 271 | |
| 272 | |
| 273 | # --------------------------------------------------------------------------- |
| 274 | # Multiple distinct objects |
| 275 | # --------------------------------------------------------------------------- |
| 276 | |
| 277 | |
class TestMultipleObjects:
    """Many distinct objects stored side by side remain individually retrievable."""

    def test_100_distinct_objects_coexist(self, repo: pathlib.Path) -> None:
        """Write 100 different payloads, then read each one back by its ID."""
        # Maps object ID -> the bytes written under it.  (The previous
        # annotation referenced ``_FileStore``, a name that is neither defined
        # nor imported in this module.)
        written: dict[str, bytes] = {}
        for i in range(100):
            data = f"payload-{i:03d}-{'z' * i}".encode()
            oid = blob_id(data)
            write_object(repo, oid, data)
            written[oid] = data

        # Verify only after every write has happened, so a later write that
        # clobbered an earlier object would be detected.
        for oid, data in written.items():
            assert read_object(repo, oid) == data

    def test_all_objects_independently_addressable(self, repo: pathlib.Path) -> None:
        """Verify no two distinct objects collide in the store."""
        written: dict[str, bytes] = {}
        for _ in range(50):
            data = secrets.token_bytes(64)
            oid = blob_id(data)
            write_object(repo, oid, data)
            written[oid] = data

        # All OIDs should be unique (probabilistic but essentially certain);
        # a duplicate OID would collapse two entries into one dict key.
        assert len(written) == 50

        # Each ID must resolve to its own payload, not to a neighbour's.
        for oid, data in written.items():
            assert read_object(repo, oid) == data
File History
1 commit
sha256:d11a87833d5fad6059b7662844bf5448a8911a17cce7a51811f71ad394f248eb
bump to v0.2.0rc13
Human
patch
6 days ago