test_security_object_store_poisoning.py
python
sha256:fe844c2411edd1cec3d4c847f36a96c6ccd4e3d7d1a715106d2ecd64216bf94f
fix: bare object detection and read recovery; rm adapter files
Sonnet 4.6
minor
⚠ breaking
3 days ago
| 1 | """Phase 2.3 — Object store poisoning tests. |
| 2 | |
| 3 | Covers every adversarial input and edge case identified in the recon phase: |
| 4 | |
| 5 | 1. Hash mismatch injection into write_object / write_object_from_path. |
| 6 | 2. Per-object size cap enforcement at write time (not just read time). |
| 7 | 3. restore_object re-hashes source before copying — corrupt store is detected. |
| 8 | 4. apply_mpack: object count limit (pack-bomb). |
| 9 | 5. apply_mpack: per-object size cap before write_object is called. |
| 10 | 6. apply_mpack: object-ID deduplication (duplicate IDs are skipped without re-hashing). |
| 11 | 7. apply_mpack: snapshot / commit isolation — malformed entries skipped. |
| 12 | 8. read_object: corruption is detected on every read; zero-byte objects are accepted as valid blobs. |
| 13 | 9. All write_object callsites confirmed to use content-derived IDs. |
| 14 | 10. Stress: 10 000-object pack processed within time budget. |
| 15 | 11. Stress: 50 concurrent poisoning attempts do not corrupt the store. |
| 16 | 12. Threat-model boundary: SHA-256 collision infeasibility documented via test. |
| 17 | """ |
| 18 | |
from __future__ import annotations

import os
import pathlib
import tempfile
import threading
import time
from collections.abc import Iterator
from unittest.mock import patch

import pytest

from muse.core.commits import CommitDict
from muse.core.mpack import ApplyResult, MPack, apply_mpack
from muse.core.object_store import (
    has_object,
    read_object,
    restore_object,
    write_object,
    write_object_from_path,
)
from muse.core.paths import config_toml_path, muse_dir
from muse.core.snapshots import SnapshotDict
from muse.core.types import Manifest, blob_id, content_hash, hash_file, long_id, now_utc_iso
from muse.core.validation import MAX_OBJECT_WRITE_BYTES, MAX_PACK_OBJECTS
| 43 | |
| 44 | |
| 45 | # --------------------------------------------------------------------------- |
| 46 | # Helpers |
| 47 | # --------------------------------------------------------------------------- |
| 48 | |
| 49 | |
| 50 | |
def _make_repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Create a hand-built repository skeleton under *tmp_path* and return its root.

    The directory returned by ``muse_dir`` receives six sub-directories, a
    ``HEAD`` file pointing at ``refs/heads/main`` and a stub ``repo.json``.
    """
    root = tmp_path / "repo"
    root.mkdir()
    meta = muse_dir(root)
    layout = ("objects", "commits", "snapshots", "refs", "refs/heads", "tags")
    for name in layout:
        # parents=True also creates the metadata directory itself on first use.
        meta.joinpath(name).mkdir(parents=True)
    meta.joinpath("HEAD").write_text("ref: refs/heads/main\n")
    meta.joinpath("repo.json").write_text('{"repo_id": "test-repo"}')
    return root
| 60 | |
| 61 | |
def _stored_object(repo: pathlib.Path, content: bytes) -> str:
    """Store *content* under its content-derived ID and hand that ID back."""
    object_id = blob_id(content)
    write_object(repo, object_id, content)
    return object_id
| 67 | |
| 68 | |
def _minimal_commit_dict(snap_id: str) -> CommitDict:
    """Build a parentless commit record on ``main`` that points at *snap_id*.

    The commit ID is a fixed placeholder (64 ``a`` characters), not a hash of
    the record; the repo ID is derived from *snap_id* via ``content_hash``.
    """
    derived_repo_id = content_hash({"role": "repo", "snap_id": snap_id})
    stamp = now_utc_iso()
    placeholder_commit_id = "a" * 64
    return CommitDict(
        commit_id=placeholder_commit_id,
        repo_id=derived_repo_id,
        branch="main",
        parent_commit_id=None,
        parent2_commit_id=None,
        snapshot_id=snap_id,
        message="test",
        author="test",
        committed_at=stamp,
        metadata={},
    )
| 84 | |
| 85 | |
def _minimal_snapshot_dict(manifest: Manifest) -> SnapshotDict:
    """Wrap *manifest* in a snapshot record whose ID is computed from the manifest."""
    # Imported locally, as in the rest of this module's private-helper lookups.
    from muse.core.ids import hash_snapshot

    return SnapshotDict(
        snapshot_id=hash_snapshot(manifest),
        manifest=manifest,
        created_at=now_utc_iso(),
    )
| 95 | |
| 96 | |
| 97 | # --------------------------------------------------------------------------- |
| 98 | # 1. Hash mismatch injection |
| 99 | # --------------------------------------------------------------------------- |
| 100 | |
| 101 | |
class TestHashMismatch:
    """The write paths must refuse a declared object ID that does not match the content."""

    def test_write_object_wrong_content_raises(self, tmp_path: pathlib.Path) -> None:
        """Bytes whose derived ID differs from the declared one are rejected and not stored."""
        repo = _make_repo(tmp_path)
        declared_id = blob_id(b"legitimate content")
        with pytest.raises(ValueError, match="Content integrity failure"):
            write_object(repo, declared_id, b"poisoned content")
        assert not has_object(repo, declared_id), "Poisoned object must not be stored"

    def test_write_object_correct_content_succeeds(self, tmp_path: pathlib.Path) -> None:
        """A matching ID/content pair is written and can be read back unchanged."""
        repo = _make_repo(tmp_path)
        payload = b"valid content"
        object_id = blob_id(payload)
        wrote = write_object(repo, object_id, payload)
        assert wrote is True
        assert read_object(repo, object_id) == payload

    def test_write_object_from_path_wrong_id_raises(self, tmp_path: pathlib.Path) -> None:
        """write_object_from_path rejects a declared ID that is not the file's ID."""
        repo = _make_repo(tmp_path)
        source = tmp_path / "real.bin"
        source.write_bytes(b"real file content")
        mismatched_id = blob_id(b"different content entirely")
        with pytest.raises(ValueError, match="Content integrity failure"):
            write_object_from_path(repo, mismatched_id, source)
        assert not has_object(repo, mismatched_id)

    def test_write_object_from_path_correct_id_succeeds(self, tmp_path: pathlib.Path) -> None:
        """A file stored under its own content-derived ID ends up in the store."""
        repo = _make_repo(tmp_path)
        payload = b"file content"
        source = tmp_path / "file.bin"
        source.write_bytes(payload)
        object_id = blob_id(payload)
        wrote = write_object_from_path(repo, object_id, source)
        assert wrote is True
        assert has_object(repo, object_id)

    def test_all_ones_id_mismatch_raises(self, tmp_path: pathlib.Path) -> None:
        """A well-formed but fabricated ID (64 ``f`` characters) is still rejected."""
        repo = _make_repo(tmp_path)
        fabricated_id = "f" * 64
        with pytest.raises(ValueError):
            write_object(repo, fabricated_id, b"something")

    def test_empty_object_valid(self, tmp_path: pathlib.Path) -> None:
        """Zero-byte content is a valid object under the ID ``blob_id`` derives for it."""
        repo = _make_repo(tmp_path)
        nothing = b""
        empty_id = blob_id(nothing)
        assert write_object(repo, empty_id, nothing) is True
        assert read_object(repo, empty_id) == nothing

    def test_invalid_object_id_format_raises(self, tmp_path: pathlib.Path) -> None:
        """Syntactically invalid IDs raise ValueError or TypeError."""
        repo = _make_repo(tmp_path)
        malformed_ids = (
            "not-a-hex-id",  # wrong length and non-hex characters
            "a" * 63,  # one character short of 64
            "G" * 64,  # right length, but 'G' is not a hex digit
        )
        for bad_id in malformed_ids:
            with pytest.raises((ValueError, TypeError)):
                write_object(repo, bad_id, b"content")
| 162 | |
| 163 | |
| 164 | # --------------------------------------------------------------------------- |
| 165 | # 2. Per-object size cap on write |
| 166 | # --------------------------------------------------------------------------- |
| 167 | |
| 168 | |
class TestObjectSizeCap:
    """MAX_OBJECT_WRITE_BYTES must be enforced by the write paths."""

    def test_oversized_content_rejected_at_write(self, tmp_path: pathlib.Path) -> None:
        """write_object must reject blobs above MAX_OBJECT_WRITE_BYTES."""
        repo = _make_repo(tmp_path)
        # One byte above the limit is the smallest payload that must be refused.
        oversized = b"x" * (MAX_OBJECT_WRITE_BYTES + 1)
        oid = blob_id(oversized)
        with pytest.raises(ValueError, match="exceeding the"):
            write_object(repo, oid, oversized)
        assert not has_object(repo, oid), "Oversized object must not be stored"

    def test_exactly_at_limit_is_rejected(self, tmp_path: pathlib.Path) -> None:
        """A blob of MAX_OBJECT_WRITE_BYTES + 1 bytes is rejected and not stored.

        NOTE(review): despite the method name, the payload is one byte *above*
        the limit; the name is kept so existing test selections keep working.
        This variant only requires a ValueError, without matching the message.
        """
        repo = _make_repo(tmp_path)
        oversized = b"y" * (MAX_OBJECT_WRITE_BYTES + 1)
        oid = blob_id(oversized)
        with pytest.raises(ValueError):
            write_object(repo, oid, oversized)
        # Same post-condition as the sibling test: nothing may reach the store.
        assert not has_object(repo, oid), "Oversized object must not be stored"

    def test_write_object_from_path_oversized_raises(self, tmp_path: pathlib.Path) -> None:
        """write_object_from_path must reject a source file larger than the limit."""
        repo = _make_repo(tmp_path)
        big_file = tmp_path / "big.bin"
        # Seeking to the limit and writing one byte yields a file of
        # MAX_OBJECT_WRITE_BYTES + 1 bytes without writing the whole payload
        # (sparse on filesystems that support it).
        with big_file.open("wb") as fh:
            fh.seek(MAX_OBJECT_WRITE_BYTES)
            fh.write(b"\x00")
        oid = hash_file(big_file)
        with pytest.raises(ValueError, match="exceeding the"):
            write_object_from_path(repo, oid, big_file)
        assert not has_object(repo, oid)

    def test_just_under_limit_succeeds(self, tmp_path: pathlib.Path) -> None:
        """A small (16-byte) blob, well below the limit, is accepted.

        NOTE(review): this does not exercise the exact boundary; a tiny blob is
        used deliberately so the test stays cheap in CI.
        """
        repo = _make_repo(tmp_path)
        tiny = b"t" * 16
        oid = blob_id(tiny)
        assert write_object(repo, oid, tiny) is True
| 209 | |
| 210 | |
| 211 | # --------------------------------------------------------------------------- |
| 212 | # 3. restore_object — hash re-verification before copy |
| 213 | # --------------------------------------------------------------------------- |
| 214 | |
| 215 | |
class TestRestoreObjectIntegrity:
    """restore_object must verify store content before it reaches the working tree."""

    def test_restore_clean_object_succeeds(self, tmp_path: pathlib.Path) -> None:
        """An intact object is restored byte-for-byte and True is returned."""
        repo = _make_repo(tmp_path)
        content = b"data to restore"
        oid = _stored_object(repo, content)
        dest = tmp_path / "restored.bin"
        assert restore_object(repo, oid, dest) is True
        assert dest.read_bytes() == content

    def test_restore_missing_object_returns_false(self, tmp_path: pathlib.Path) -> None:
        """Restoring an ID that was never stored returns False and creates no file."""
        repo = _make_repo(tmp_path)
        ghost_id = blob_id(b"ghost")
        dest = tmp_path / "ghost.bin"
        assert restore_object(repo, ghost_id, dest) is False
        assert not dest.exists()

    def test_restore_detects_corrupted_store_object(self, tmp_path: pathlib.Path) -> None:
        """If the on-disk object file is corrupted, restore_object must raise OSError."""
        repo = _make_repo(tmp_path)
        content = b"important file content"
        oid = _stored_object(repo, content)

        # Overwrite the stored file in place: lift the read-only mode, replace
        # the bytes, then put the read-only mode back.
        from muse.core.object_store import _object_path_with_fallback

        obj_file = _object_path_with_fallback(repo, oid)
        os.chmod(obj_file, 0o644)
        obj_file.write_bytes(b"corrupted bytes that do not match the declared hash")
        os.chmod(obj_file, 0o444)

        dest = tmp_path / "should-not-exist.bin"
        with pytest.raises(OSError, match="failed SHA-256 integrity check"):
            restore_object(repo, oid, dest)
        assert not dest.exists(), "No corrupted data must reach the working tree"

    def test_restore_dest_is_writable(self, tmp_path: pathlib.Path) -> None:
        """Restored files must be writable (0o444 object mode must not propagate)."""
        repo = _make_repo(tmp_path)
        content = b"editable file"
        oid = _stored_object(repo, content)
        dest = tmp_path / "editable.txt"
        # Check the return value so a failed restore is reported as such rather
        # than hidden by the write below creating the file itself.
        assert restore_object(repo, oid, dest) is True
        dest.write_bytes(b"new content")  # must not raise PermissionError

    def test_restore_is_atomic(self, tmp_path: pathlib.Path) -> None:
        """After restore_object returns, the destination holds the complete content.

        NOTE(review): this is a single-threaded check of the final state; it
        does not run a concurrent reader, so it cannot by itself prove that no
        partial file is ever observable.
        """
        repo = _make_repo(tmp_path)
        content = b"atomic restore test " + b"x" * 1000
        oid = _stored_object(repo, content)
        dest = tmp_path / "atomic.bin"
        assert restore_object(repo, oid, dest) is True
        assert dest.read_bytes() == content
| 268 | |
| 269 | |
| 270 | # --------------------------------------------------------------------------- |
| 271 | # 4 & 5. apply_mpack — pack-bomb and per-object size cap |
| 272 | # --------------------------------------------------------------------------- |
| 273 | |
| 274 | |
class TestApplyMPackBomb:
    """apply_mpack must bound both the item count and the size of each blob."""

    def _build_mpack(
        self,
        *,
        n_objects: int = 0,
        n_snapshots: int = 0,
        n_commits: int = 0,
        object_size: int = 1,
    ) -> MPack:
        """Return a pack holding *n_objects* distinct blobs and nothing else.

        Each blob is ``object-<i>`` followed by *object_size* NUL bytes.
        *n_snapshots* and *n_commits* are accepted but not used: the pack is
        always built with empty snapshot and commit lists.
        """
        payloads = (
            f"object-{index}".encode() + b"\x00" * object_size
            for index in range(n_objects)
        )
        blobs = [{"object_id": blob_id(data), "content": data} for data in payloads]
        return MPack(commits=[], snapshots=[], blobs=blobs)

    def test_pack_at_limit_succeeds(self, tmp_path: pathlib.Path) -> None:
        """A small pack (at most 10 blobs, never above MAX_PACK_OBJECTS) is fully applied."""
        repo = _make_repo(tmp_path)
        blob_count = min(10, MAX_PACK_OBJECTS)
        outcome = apply_mpack(repo, self._build_mpack(n_objects=blob_count))
        assert outcome["blobs_written"] == blob_count

    def test_pack_exceeds_limit_raises(self, tmp_path: pathlib.Path) -> None:
        """A pack with more than MAX_PACK_OBJECTS items must be rejected."""
        repo = _make_repo(tmp_path)
        # The entries are placeholders sharing one dict: the test relies on the
        # count check firing before any individual entry is examined.
        placeholder = {"object_id": "a" * 64, "content": b"x"}
        too_many = [placeholder] * (MAX_PACK_OBJECTS + 1)
        oversized_bundle: MPack = MPack(commits=[], snapshots=[], blobs=too_many)
        with pytest.raises(ValueError, match="exceeds the"):
            apply_mpack(repo, oversized_bundle)

    def test_oversized_object_in_pack_is_skipped(self, tmp_path: pathlib.Path) -> None:
        """A blob above MAX_OBJECT_WRITE_BYTES is skipped; the rest of the pack is applied."""
        repo = _make_repo(tmp_path)
        huge = b"B" * (MAX_OBJECT_WRITE_BYTES + 1)
        huge_id = blob_id(huge)
        small = b"tiny object"
        small_id = blob_id(small)
        entries = [
            {"object_id": huge_id, "content": huge},
            {"object_id": small_id, "content": small},
        ]
        outcome = apply_mpack(repo, MPack(commits=[], snapshots=[], blobs=entries))
        # Only the small blob may end up in the store.
        assert not has_object(repo, huge_id), "Oversized object must not be stored"
        assert has_object(repo, small_id), "Valid object must be stored"
        assert outcome["blobs_written"] == 1

    def test_zero_item_pack_is_accepted(self, tmp_path: pathlib.Path) -> None:
        """An entirely empty pack applies cleanly and reports all-zero counters."""
        repo = _make_repo(tmp_path)
        outcome = apply_mpack(repo, MPack(commits=[], snapshots=[], blobs=[]))
        nothing_done = ApplyResult(
            commits_written=0,
            snapshots_written=0,
            blobs_written=0,
            blobs_skipped=0,
            tags_written=0,
            failed_blobs=[],
            skipped_snapshots=[],
        )
        assert outcome == nothing_done
| 352 | |
| 353 | |
| 354 | # --------------------------------------------------------------------------- |
| 355 | # 6. apply_mpack — object-ID deduplication |
| 356 | # --------------------------------------------------------------------------- |
| 357 | |
| 358 | |
class TestApplyPackDeduplication:
    """Repeated object IDs inside one pack are written once and counted as skipped."""

    def test_duplicate_object_ids_not_hashed_twice(self, tmp_path: pathlib.Path) -> None:
        """One blob sent 100 times yields a single write and 99 skips."""
        repo = _make_repo(tmp_path)
        payload = b"dedup test object"
        object_id = blob_id(payload)
        entry = {"object_id": object_id, "content": payload}
        outcome = apply_mpack(
            repo,
            MPack(commits=[], snapshots=[], blobs=[entry] * 100),
        )
        assert outcome["blobs_written"] == 1
        assert outcome["blobs_skipped"] == 99
        assert has_object(repo, object_id)

    def test_duplicate_then_different_both_processed(self, tmp_path: pathlib.Path) -> None:
        """A duplicate in the middle of a pack does not stop a later, distinct blob."""
        repo = _make_repo(tmp_path)
        first, second = b"first object", b"second object"
        first_id = blob_id(first)
        second_id = blob_id(second)
        entries = [
            {"object_id": first_id, "content": first},
            {"object_id": first_id, "content": first},  # repeat of the entry above
            {"object_id": second_id, "content": second},
        ]
        outcome = apply_mpack(repo, MPack(commits=[], snapshots=[], blobs=entries))
        assert outcome["blobs_written"] == 2
        assert outcome["blobs_skipped"] == 1
| 394 | |
| 395 | |
| 396 | # --------------------------------------------------------------------------- |
| 397 | # 7. apply_mpack — malformed entries are isolated (snapshot / commit) |
| 398 | # --------------------------------------------------------------------------- |
| 399 | |
| 400 | |
class TestApplyPackMalformedEntries:
    """A bad blob entry must be skipped without aborting the rest of the pack."""

    def test_malformed_object_entry_does_not_abort_pack(self, tmp_path: pathlib.Path) -> None:
        """Bad entries are skipped and a valid entry after them is still written.

        The three entries deliberately carry three distinct object IDs, so the
        result does not depend on how repeated IDs are deduplicated within one
        apply_mpack call: one ID is malformed, one does not match its content,
        and one is valid.
        """
        repo = _make_repo(tmp_path)
        mismatched_id = blob_id(b"good object A")
        valid_payload = b"good object B"
        valid_id = blob_id(valid_payload)
        entries = [
            {"object_id": "not-hex", "content": b"bad"},  # malformed ID
            {"object_id": mismatched_id, "content": b"wrong bytes"},  # hash mismatch
            {"object_id": valid_id, "content": valid_payload},  # valid, distinct ID
        ]
        outcome = apply_mpack(repo, MPack(commits=[], snapshots=[], blobs=entries))
        assert not has_object(repo, mismatched_id), "Hash-mismatched entry must not be stored"
        assert has_object(repo, valid_id), "Valid entry after bad ones must be stored"
        assert outcome["blobs_written"] == 1

    def test_missing_object_id_in_pack_entry_skipped(self, tmp_path: pathlib.Path) -> None:
        """An entry whose object ID is the empty string writes nothing."""
        repo = _make_repo(tmp_path)
        entries = [{"object_id": "", "content": b"anything"}]
        outcome = apply_mpack(repo, MPack(commits=[], snapshots=[], blobs=entries))
        assert outcome["blobs_written"] == 0

    def test_empty_content_in_pack_entry_skipped(self, tmp_path: pathlib.Path) -> None:
        """An entry with an empty object ID and empty content writes nothing."""
        repo = _make_repo(tmp_path)
        from muse.core.mpack import BlobPayload

        # Both fields are empty, so there is no ID under which to store anything.
        blank = BlobPayload(object_id="", content=b"")
        outcome = apply_mpack(repo, MPack(commits=[], snapshots=[], blobs=[blank]))
        assert outcome["blobs_written"] == 0
| 450 | |
| 451 | |
| 452 | # --------------------------------------------------------------------------- |
| 453 | # 8. read_object — corruption detected on every read |
| 454 | # --------------------------------------------------------------------------- |
| 455 | |
| 456 | |
class TestReadObjectIntegrity:
    """read_object must return intact content and refuse tampered content."""

    def test_read_clean_object_succeeds(self, tmp_path: pathlib.Path) -> None:
        """An untouched object reads back as the bytes that were written."""
        repo = _make_repo(tmp_path)
        payload = b"clean read test"
        object_id = _stored_object(repo, payload)
        assert read_object(repo, object_id) == payload

    def test_read_corrupted_object_raises(self, tmp_path: pathlib.Path) -> None:
        """Reading an object whose stored file was overwritten raises OSError."""
        repo = _make_repo(tmp_path)
        object_id = _stored_object(repo, b"will be corrupted")
        from muse.core.object_store import _object_path_with_fallback

        stored_file = _object_path_with_fallback(repo, object_id)
        # Make the file writable, replace its bytes, then restore read-only mode.
        stored_file.chmod(0o644)
        stored_file.write_bytes(b"corrupted bytes")
        stored_file.chmod(0o444)
        with pytest.raises(OSError, match="integrity check"):
            read_object(repo, object_id)

    def test_read_absent_object_returns_none(self, tmp_path: pathlib.Path) -> None:
        """An ID that was never stored reads as None rather than raising."""
        repo = _make_repo(tmp_path)
        never_stored = blob_id(b"absent")
        assert read_object(repo, never_stored) is None
| 479 | |
| 480 | |
| 481 | # --------------------------------------------------------------------------- |
| 482 | # 9. Confirmed: all write_object callsites use content-derived IDs |
| 483 | # --------------------------------------------------------------------------- |
| 484 | |
| 485 | |
class TestCallsiteIntegrity:
    """Entry points that write objects must derive or verify the ID from the content."""

    def test_hash_object_stdin_derives_id_from_content(self, tmp_path: pathlib.Path) -> None:
        """hash-object with --write derives object_id from actual stdin bytes."""
        from tests.cli_test_helper import CliRunner

        repo = _make_repo(tmp_path)
        config_toml_path(repo).write_text("[core]\nauthor = \"test\"\n")
        content = b"stdin content for hashing"
        expected_oid = blob_id(content)
        runner = CliRunner()
        result = runner.invoke(
            None,
            ["hash-object", "--stdin", "--write"],
            input=content,
            env={"MUSE_REPO_ROOT": str(repo)},
        )
        assert result.exit_code == 0, result.output
        assert expected_oid in result.output
        assert has_object(repo, expected_oid)

    def test_hash_object_file_derives_id_from_file_content(self, tmp_path: pathlib.Path) -> None:
        """hash-object with a file path derives object_id from actual file bytes."""
        from tests.cli_test_helper import CliRunner

        repo = _make_repo(tmp_path)
        config_toml_path(repo).write_text("[core]\nauthor = \"test\"\n")
        content = b"file content for hashing"
        target = tmp_path / "target.bin"
        target.write_bytes(content)
        expected_oid = blob_id(content)
        runner = CliRunner()
        result = runner.invoke(
            None,
            ["hash-object", str(target), "--write"],
            env={"MUSE_REPO_ROOT": str(repo)},
        )
        assert result.exit_code == 0, result.output
        assert expected_oid in result.output
        assert has_object(repo, expected_oid)

    def test_unpack_objects_hash_mismatch_rejected(self, tmp_path: pathlib.Path) -> None:
        """A pack blob whose content does not match its declared ID is not stored.

        This calls apply_mpack directly rather than the ``unpack-objects`` CLI,
        so neither the CLI runner nor a config file is needed; other tests in
        this module call apply_mpack on an unconfigured repo in the same way.
        """
        repo = _make_repo(tmp_path)
        legit_oid = blob_id(b"legitimate")

        mpack: MPack = MPack(
            commits=[],
            snapshots=[],
            blobs=[{"object_id": legit_oid, "content": b"malicious bytes"}],
        )
        result = apply_mpack(repo, mpack)
        # The poisoned entry must be dropped, leaving the store untouched.
        assert not has_object(repo, legit_oid), "Poisoned object must not enter the store"
        assert result["blobs_written"] == 0
| 541 | |
| 542 | |
| 543 | # --------------------------------------------------------------------------- |
| 544 | # 10. Stress: 10 000-object pack processed within time budget |
| 545 | # --------------------------------------------------------------------------- |
| 546 | |
| 547 | |
class TestStress:
    """Volume tests for apply_mpack, run with the fsync-related calls patched out."""

    @pytest.fixture(autouse=True)
    def no_fsync(self) -> Iterator[None]:
        """Patch out fsync/fcntl for every test in this class.

        Replaces ``_fsync_fd`` in the object store, ``os.fsync`` as seen from
        the commits and io modules, and ``fcntl.fcntl`` as seen from the io
        module, so the time budget reflects algorithmic cost rather than disk
        flush latency. The patches are undone when the test finishes.
        """
        with patch("muse.core.object_store._fsync_fd", return_value=None), \
             patch("muse.core.commits.os.fsync", return_value=None), \
             patch("muse.core.io.os.fsync", return_value=None), \
             patch("muse.core.io.fcntl.fcntl", return_value=0):
            yield

    @pytest.mark.perf
    def test_10k_object_pack_within_budget(self, tmp_path: pathlib.Path) -> None:
        """10 000 unique objects written through apply_mpack in under 30 seconds."""
        repo = _make_repo(tmp_path)
        n = 10_000
        payloads = [f"stress-object-{i:06d}".encode() for i in range(n)]
        objects = [{"object_id": blob_id(data), "content": data} for data in payloads]

        mpack: MPack = MPack(commits=[], snapshots=[], blobs=objects)
        # Only the apply_mpack call is timed; building the pack is excluded.
        start = time.monotonic()
        result = apply_mpack(repo, mpack)
        elapsed = time.monotonic() - start

        assert result["blobs_written"] == n
        assert elapsed < 30.0, f"10k-object pack took {elapsed:.1f}s — too slow"

    def test_idempotent_10k_pack_fast(self, tmp_path: pathlib.Path) -> None:
        """Re-applying a pack writes nothing: every blob is reported as skipped.

        NOTE(review): despite the method name, the pack holds 1 000 objects and
        no timing is measured; only the written/skipped counters are checked.
        """
        repo = _make_repo(tmp_path)
        n = 1_000  # deliberately smaller than the budget test above
        payloads = [f"idem-object-{i:06d}".encode() for i in range(n)]
        objects = [{"object_id": blob_id(data), "content": data} for data in payloads]

        mpack: MPack = MPack(commits=[], snapshots=[], blobs=objects)
        apply_mpack(repo, mpack)  # first application populates the store
        result2 = apply_mpack(repo, mpack)  # second application must be a no-op
        assert result2["blobs_written"] == 0
        assert result2["blobs_skipped"] == n

    def test_10k_duplicate_ids_deduplicated(self, tmp_path: pathlib.Path) -> None:
        """10 000 entries with the same object_id are deduplicated to one write."""
        repo = _make_repo(tmp_path)
        content = b"one true object"
        oid = blob_id(content)
        mpack: MPack = MPack(
            commits=[],
            snapshots=[],
            blobs=[{"object_id": oid, "content": content}] * 10_000,
        )
        result = apply_mpack(repo, mpack)
        assert result["blobs_written"] == 1
        assert result["blobs_skipped"] == 9_999
| 606 | |
| 607 | |
| 608 | # --------------------------------------------------------------------------- |
| 609 | # 11. Concurrent poisoning stress |
| 610 | # --------------------------------------------------------------------------- |
| 611 | |
| 612 | |
class TestConcurrentPoisoning:
    """Concurrent writers must neither poison nor corrupt the store."""

    @staticmethod
    def _run_to_completion(threads: list[threading.Thread]) -> None:
        """Start every thread, join each with a timeout, and fail if any is still alive.

        ``Thread.join(timeout=...)`` returns silently when the timeout expires,
        so liveness has to be checked explicitly afterwards.
        """
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=5.0)
        stuck = [t.name for t in threads if t.is_alive()]
        assert not stuck, f"Threads still running after join timeout: {stuck}"

    def test_concurrent_hash_mismatch_attempts_do_not_corrupt(
        self, tmp_path: pathlib.Path
    ) -> None:
        """50 threads simultaneously trying to poison the store — none succeeds."""
        repo = _make_repo(tmp_path)
        legit_content = b"the one true content"
        legit_oid = blob_id(legit_content)

        # Write the legitimate object first.
        write_object(repo, legit_oid, legit_content)

        errors: list[str] = []
        lock = threading.Lock()

        def poison_attempt(idx: int) -> None:
            malicious_content = f"malicious-{idx}".encode()
            try:
                write_object(repo, legit_oid, malicious_content)
            except ValueError:
                return  # expected: the mismatching write is refused
            except Exception as exc:
                # An exception raised in a worker thread never reaches pytest,
                # so record it here instead of letting it vanish.
                with lock:
                    errors.append(f"Thread {idx}: unexpected {type(exc).__name__}: {exc}")
                return
            with lock:
                errors.append(f"Thread {idx}: poisoning succeeded!")

        # daemon=True so a hung worker cannot block interpreter shutdown after
        # the liveness assertion has reported it.
        threads = [
            threading.Thread(target=poison_attempt, args=(i,), daemon=True)
            for i in range(50)
        ]
        self._run_to_completion(threads)

        assert not errors, "\n".join(errors)
        # The stored object must still be the legitimate one.
        assert read_object(repo, legit_oid) == legit_content

    def test_concurrent_writes_of_same_object_idempotent(
        self, tmp_path: pathlib.Path
    ) -> None:
        """50 threads writing the same valid object — at least one write, no corruption."""
        repo = _make_repo(tmp_path)
        content = b"concurrent valid object"
        oid = blob_id(content)
        results: list[bool] = []
        failures: list[str] = []
        lock = threading.Lock()

        def write_it() -> None:
            try:
                wrote = write_object(repo, oid, content)
            except Exception as exc:
                # Surface worker failures to the main thread (see above).
                with lock:
                    failures.append(f"{type(exc).__name__}: {exc}")
                return
            with lock:
                results.append(wrote)

        threads = [threading.Thread(target=write_it, daemon=True) for _ in range(50)]
        self._run_to_completion(threads)

        assert not failures, "\n".join(failures)
        assert results.count(True) >= 1, "At least one thread must have written"
        assert read_object(repo, oid) == content
| 668 | |
| 669 | |
| 670 | # --------------------------------------------------------------------------- |
| 671 | # 12. SHA-256 threat model documentation test |
| 672 | # --------------------------------------------------------------------------- |
| 673 | |
| 674 | |
class TestSHA256ThreatModel:
    """Executable statement of the object store's threat model."""

    def test_sha256_preimage_resistance_documented(self) -> None:
        """Record that second-preimage resistance of SHA-256 is the security boundary.

        The tests in this module check that the store refuses mismatching
        content on three paths:

        1. write_object rejects content that does not match the declared ID.
        2. read_object fails its integrity check on tampered stored bytes.
        3. restore_object fails its integrity check before writing the destination.

        Given those checks, poisoning an object requires a second preimage: a
        different message M' with sha256(M') == sha256(M). Brute-forcing one
        takes on the order of 2^256 operations, which is treated here as
        infeasible for any adversary.

        This is a living specification of the threat model, not a
        cryptographic proof; the assertion below is only a sanity check.
        """
        digest_a = blob_id(b"message A")
        digest_b = blob_id(b"message B")
        # Distinct inputs are expected to yield distinct IDs; a collision is
        # not impossible in theory, merely computationally out of reach.
        assert digest_a != digest_b

    def test_write_then_read_roundtrip_preserves_content(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Twenty payloads of growing size each read back exactly as written."""
        repo = _make_repo(tmp_path)
        for index in range(20):
            repeats = index + 1
            payload = f"stress-content-{index}".encode() * repeats
            object_id = blob_id(payload)
            write_object(repo, object_id, payload)
            assert read_object(repo, object_id) == payload

    def test_object_mode_is_immutable(self, tmp_path: pathlib.Path) -> None:
        """A stored object's permission bits are 0o444 (read-only for everyone)."""
        repo = _make_repo(tmp_path)
        object_id = _stored_object(repo, b"immutable object")
        from muse.core.object_store import _object_path_with_fallback

        stored_file = _object_path_with_fallback(repo, object_id)
        permission_bits = stored_file.stat().st_mode & 0o777
        actual = oct(permission_bits)
        assert actual == oct(0o444), f"Expected 0o444, got {actual}"
| 720 | |
| 721 | |
class TestWriteObjectFromPathRoundTrip:
    """Objects stored via write_object_from_path must be readable by read_object."""

    def test_read_returns_exact_content(self, tmp_path: pathlib.Path) -> None:
        """Bytes stored from a file come back unchanged from read_object."""
        repo = _make_repo(tmp_path)
        payload = b"hello world, this is a blob"
        source = tmp_path / "blob.txt"
        source.write_bytes(payload)
        object_id = blob_id(payload)
        write_object_from_path(repo, object_id, source)
        assert read_object(repo, object_id) == payload

    def test_write_from_path_and_write_object_are_equivalent(
        self, tmp_path: pathlib.Path
    ) -> None:
        """The same content stored through either write path reads back identically."""
        first_parent = tmp_path / "r1"
        second_parent = tmp_path / "r2"
        first_parent.mkdir()
        second_parent.mkdir()
        repo_from_bytes = _make_repo(first_parent)
        repo_from_file = _make_repo(second_parent)
        payload = b"equivalent content"
        source = tmp_path / "src.bin"
        source.write_bytes(payload)
        object_id = blob_id(payload)
        write_object(repo_from_bytes, object_id, payload)
        write_object_from_path(repo_from_file, object_id, source)
        read_via_bytes = read_object(repo_from_bytes, object_id)
        read_via_file = read_object(repo_from_file, object_id)
        assert read_via_bytes == read_via_file
        assert read_via_file == payload

    def test_get_all_commits_does_not_flag_blob_as_corrupt(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A blob stored via write_object_from_path is written with a ``blob `` header.

        NOTE(review): the method name refers to commit scans, but the test only
        inspects the raw bytes of the first file found under ``sha256/*/*`` in
        the objects directory; it does not run a commit scan itself.
        """
        from muse.core.object_store import objects_dir

        repo = _make_repo(tmp_path)
        payload = b"I am a Python source file\ndef foo(): pass\n"
        source = tmp_path / "foo.py"
        source.write_bytes(payload)
        object_id = blob_id(payload)
        write_object_from_path(repo, object_id, source)
        candidates = objects_dir(repo).glob("sha256/*/*")
        stored_file = next(candidates, None)
        assert stored_file is not None
        raw = stored_file.read_bytes()
        assert raw.startswith(b"blob "), (
            "Stored object must begin with 'blob ' header"
        )

    def test_bare_objects_readable_after_migration(
        self, tmp_path: pathlib.Path
    ) -> None:
        """read_object returns the content of an object file that has no header."""
        from muse.core.object_store import object_path

        repo = _make_repo(tmp_path)
        payload = b"legacy blob without header"
        object_id = blob_id(payload)
        # Plant the raw bytes directly at the object's path, bypassing the
        # write functions, to imitate a header-less object file.
        bare_file = object_path(repo, object_id)
        bare_file.parent.mkdir(parents=True, exist_ok=True)
        bare_file.write_bytes(payload)
        bare_file.chmod(0o444)
        assert read_object(repo, object_id) == payload
File History
1 commit
sha256:fe844c2411edd1cec3d4c847f36a96c6ccd4e3d7d1a715106d2ecd64216bf94f
fix: bare object detection and read recovery; rm adapter files
Sonnet 4.6
minor
⚠
3 days ago