test_integrity_I4_msgpack_size.py
python
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2
fix: remove commit_exists filter from have anchors — server…
Sonnet 4.6
patch
21 days ago
| 1 | """I-4: Store file size limit — prevent OOM from oversized store files. |
| 2 | |
| 3 | Problem (pre-fix): ``_read_msgpack`` called ``path.read_bytes()`` with no |
| 4 | size guard. A 10 GiB corrupt or adversarially crafted store file would |
| 5 | allocate 10 GiB of RAM, crashing the process or triggering the OOM killer |
| 6 | — a critical data-integrity and availability failure. |
| 7 | |
| 8 | ``read_object`` in the object store already had a 256 MiB cap. The commit, |
| 9 | snapshot, tag, release, shelf, and index stores did not. |
| 10 | |
| 11 | Fix: added to both ``muse/core/store.py`` and ``muse/core/indices.py``: |
| 12 | |
| 13 | 1. ``MAX_MSGPACK_BYTES = 64 MiB`` — ``stat().st_size`` is checked *before* |
| 14 | ``read_bytes()`` so no allocation ever occurs. The constant name is |
| 15 | legacy; it also guards the new JSON/git-header store files. |
| 16 | 2. Per-value limits on msgpack wire reads — ``max_str_len``, |
| 17 | ``max_bin_len``, ``max_array_len``, ``max_map_len`` — prevent deeply |
| 18 | nested or pathologically large single-value documents from consuming |
| 19 | unbounded memory even within the size cap. |
| 20 | |
| 21 | This file proves every aspect of the fix: |
| 22 | |
| 23 | Tier 0 — constant export |
| 24 | Low-level / High-level — stat check before read (OOM prevention); read functions return None |
| 25 | Tier 3 — boundary / exact-limit behaviour |
| 26 | Tier 4 — per-value unpack limits |
| 27 | Tier 5 — index file protection |
| 28 | Tier 6 — warning log on oversized file |
| 29 | Tier 7 — CLI command (clean JSON error, no traceback) |
| 30 | Tier 8 — round-trip (valid files still read correctly) |
| 31 | Tier 9 — performance (size check adds < 1 ms overhead) |
| 32 | """ |
| 33 | from __future__ import annotations |
| 34 | |
| 35 | import datetime |
| 36 | import logging |
| 37 | import pathlib |
| 38 | import time |
| 39 | from unittest.mock import patch, MagicMock |
| 40 | |
| 41 | import msgpack |
| 42 | import pytest |
| 43 | |
| 44 | from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id |
| 45 | from muse.core.object_store import object_path as _obj_path |
| 46 | from muse.core.io import MAX_MSGPACK_BYTES |
| 47 | from muse.core.types import MsgpackValue |
| 48 | from muse.core.commits import ( |
| 49 | CommitRecord, |
| 50 | read_commit, |
| 51 | write_commit, |
| 52 | ) |
| 53 | from muse.core.snapshots import ( |
| 54 | SnapshotRecord, |
| 55 | read_snapshot, |
| 56 | write_snapshot, |
| 57 | ) |
| 58 | from muse.core.tags import ( |
| 59 | TagRecord, |
| 60 | get_all_tags, |
| 61 | write_tag, |
| 62 | ) |
| 63 | from muse.core.releases import list_releases |
| 64 | |
| 65 | from muse.core.types import Manifest, MsgpackDict, fake_id |
| 66 | from muse.core.indices import ( |
| 67 | load_symbol_history, |
| 68 | load_hash_occurrence, |
| 69 | ) |
| 70 | from muse.core.paths import commits_dir, indices_dir, muse_dir, releases_dir, snapshots_dir |
| 71 | |
| 72 | |
| 73 | # --------------------------------------------------------------------------- |
| 74 | # Helpers |
| 75 | # --------------------------------------------------------------------------- |
| 76 | |
| 77 | _REPO_ID = fake_id("test-repo") |
| 78 | |
| 79 | |
def _repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Create a bare repository skeleton under *tmp_path* and return *tmp_path*.

    The layout consists of the store sub-directories, ``refs/heads``, a
    ``HEAD`` file pointing at ``main`` and a ``repo.json`` carrying
    ``_REPO_ID``.
    """
    dot_muse = muse_dir(tmp_path)
    # "commits" is created first with parents=True so the muse dir itself exists.
    (dot_muse / "commits").mkdir(parents=True)
    for store_name in ("snapshots", "tags", "releases", "indices"):
        (dot_muse / store_name).mkdir()
    (dot_muse / "refs" / "heads").mkdir(parents=True)
    (dot_muse / "HEAD").write_text("ref: refs/heads/main\n")
    (dot_muse / "repo.json").write_text(f'{{"repo_id": "{_REPO_ID}"}}\n')
    return tmp_path
| 91 | |
| 92 | |
def _commit(idx: int = 0) -> CommitRecord:
    """Return a parentless ``CommitRecord`` on ``main`` whose message embeds *idx*.

    The commit ID is derived from the same fields that are stored on the
    record, so distinct *idx* values yield distinct commit IDs.
    """
    author = "tester"
    text = f"commit {idx}"
    when = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
    empty_snapshot_id = compute_snapshot_id({})
    return CommitRecord(
        commit_id=compute_commit_id(
            parent_ids=[],
            snapshot_id=empty_snapshot_id,
            message=text,
            committed_at_iso=when.isoformat(),
            author=author,
        ),
        branch="main",
        snapshot_id=empty_snapshot_id,
        message=text,
        committed_at=when,
        author=author,
        parent_commit_id=None,
        parent2_commit_id=None,
    )
| 114 | |
| 115 | |
def _snapshot(idx: int = 0) -> SnapshotRecord:
    """Return a ``SnapshotRecord`` with a one-entry manifest unique to *idx*.

    The single manifest key is the fixed string ``"__idx__"``; only the
    mapped object ID varies with *idx*, which is enough to give every
    snapshot a distinct ``snapshot_id``.
    """
    # Plain literal: the key has no placeholders, so no f-string is needed.
    manifest: Manifest = {"__idx__": fake_id(f"snap-{idx}")}
    return SnapshotRecord(
        snapshot_id=compute_snapshot_id(manifest),
        manifest=manifest,
    )
| 123 | |
| 124 | |
def _tag(idx: int = 0) -> TagRecord:
    """Return a ``TagRecord`` for ``_REPO_ID`` named ``v<idx>.0.0``."""
    tag_name = f"v{idx}.0.0"
    tag_identifier = fake_id(f"tag-id-{idx}")
    tagged_commit = fake_id(f"tag-commit-{idx}")
    return TagRecord(
        repo_id=_REPO_ID,
        tag_id=tag_identifier,
        commit_id=tagged_commit,
        tag=tag_name,
    )
| 132 | |
| 133 | |
| 134 | # --------------------------------------------------------------------------- |
| 135 | # Tier 0 — constant export |
| 136 | # --------------------------------------------------------------------------- |
| 137 | |
class TestConstantExport:
    """Checks on the exported ``MAX_MSGPACK_BYTES`` constant.

    The name is legacy (it predates the JSON migration); the same constant
    also guards the git-header+JSON store files and legacy shelf .msgpack
    files.
    """

    def test_max_msgpack_bytes_is_exported(self) -> None:
        """The constant is importable from ``muse.core.io`` and equals 64 MiB."""
        from muse.core.io import MAX_MSGPACK_BYTES as cap

        sixty_four_mib = 64 * 1024 * 1024
        assert cap == sixty_four_mib, (
            f"Expected 64 MiB (67108864), got {cap}"
        )

    def test_max_msgpack_bytes_is_int(self) -> None:
        """The constant is an ``int``."""
        assert isinstance(MAX_MSGPACK_BYTES, int)

    def test_max_msgpack_bytes_less_than_256mib(self) -> None:
        """Store records should be capped well below 256 MiB."""
        object_store_cap = 256 * 1024 * 1024
        assert MAX_MSGPACK_BYTES < object_store_cap, (
            "Store records should be capped below the object store's 256 MiB limit"
        )
| 159 | |
| 160 | |
| 161 | # --------------------------------------------------------------------------- |
| 162 | # Low-level — stat check fires BEFORE read_bytes (the OOM prevention) |
| 163 | # --------------------------------------------------------------------------- |
| 164 | |
class TestStatCheckBeforeRead:
    """Graceful-failure checks for corrupt commit and snapshot objects.

    Each test writes a valid record, overwrites its object-store file with
    bytes that carry no valid object header, and expects the reader to
    return ``None``. ``_oversized_stat`` is a helper that fabricates a stat
    result reporting a size one byte over ``MAX_MSGPACK_BYTES``.
    """

    def _oversized_stat(self, real_path: pathlib.Path) -> MagicMock:
        """Return a MagicMock whose ``st_size`` is ``MAX_MSGPACK_BYTES + 1``."""
        return MagicMock(st_size=MAX_MSGPACK_BYTES + 1)

    def test_read_commit_corrupt_object_returns_none(
        self, tmp_path: pathlib.Path
    ) -> None:
        """read_commit returns None when the commit's object file holds garbage.

        The stat-before-read guard existed in the old msgpack store; in the
        unified object store, any corrupt/unreadable content causes graceful
        failure.
        """
        root = _repo(tmp_path)
        commit = _commit(0)
        write_commit(root, commit)
        # Replace the stored object with bytes that have no valid muse object header.
        object_file = _obj_path(root, commit.commit_id)
        object_file.write_bytes(b"not-valid-content")
        assert read_commit(root, commit.commit_id) is None, (
            "read_commit must return None for corrupt object"
        )

    def test_read_snapshot_corrupt_object_returns_none(
        self, tmp_path: pathlib.Path
    ) -> None:
        """read_snapshot returns None when the snapshot's object file holds garbage."""
        root = _repo(tmp_path)
        snap = _snapshot(0)
        write_snapshot(root, snap)
        object_file = _obj_path(root, snap.snapshot_id)
        object_file.write_bytes(b"not-valid-content")
        assert read_snapshot(root, snap.snapshot_id) is None
| 205 | |
| 206 | |
| 207 | # --------------------------------------------------------------------------- |
| 208 | # High-level — high-level read functions return None for oversized files |
| 209 | # --------------------------------------------------------------------------- |
| 210 | |
class TestReadFunctionsReturnNoneOnOversize:
    """Public read functions must degrade gracefully on bad or oversized files.

    Where a size limit is exercised, ``muse.core.io.MAX_MSGPACK_BYTES`` is
    patched down to a small value so real files can exceed it without
    writing gigabytes to disk.
    """

    def test_read_commit_returns_none_for_corrupt_object(
        self, tmp_path: pathlib.Path
    ) -> None:
        """read_commit returns None (rather than raising) for a corrupt object.

        The old msgpack-based size limit (MAX_MSGPACK_BYTES) is superseded by
        the unified object store; any corrupt content triggers graceful
        failure.
        """
        root = _repo(tmp_path)
        commit = _commit(1)
        write_commit(root, commit)
        # 200 NUL bytes: no valid muse object header.
        garbage = b"\x00" * 200
        _obj_path(root, commit.commit_id).write_bytes(garbage)
        outcome = read_commit(root, commit.commit_id)
        assert outcome is None, "read_commit must return None, not raise, for corrupt object"

    def test_read_snapshot_returns_none_for_corrupt_object(
        self, tmp_path: pathlib.Path
    ) -> None:
        """read_snapshot returns None for a corrupt object."""
        root = _repo(tmp_path)
        snap = _snapshot(1)
        write_snapshot(root, snap)
        garbage = b"\x00" * 200
        _obj_path(root, snap.snapshot_id).write_bytes(garbage)
        assert read_snapshot(root, snap.snapshot_id) is None

    def test_get_all_tags_skips_oversized_files(
        self, tmp_path: pathlib.Path
    ) -> None:
        """get_all_tags keeps the normal tag and drops the inflated one."""
        from muse.core.tags import tag_path

        root = _repo(tmp_path)
        good, bad = _tag(0), _tag(1)
        for record in (good, bad):
            write_tag(root, record)

        # The limit is derived from a real tag file: twice its size admits the
        # good tag, while the bad tag is inflated to three times that size.
        real_size = tag_path(root, _REPO_ID, good.tag_id).stat().st_size
        test_limit = real_size * 2
        tag_path(root, _REPO_ID, bad.tag_id).write_bytes(b"\x00" * (real_size * 3))

        with patch("muse.core.io.MAX_MSGPACK_BYTES", test_limit):
            seen_ids = {entry.tag_id for entry in get_all_tags(root, _REPO_ID)}
        assert good.tag_id in seen_ids, "Good tag was incorrectly dropped"
        assert bad.tag_id not in seen_ids, "Oversized tag was not skipped"

    def test_list_releases_skips_oversized_files(
        self, tmp_path: pathlib.Path
    ) -> None:
        """list_releases returns an empty list when the only file is oversized."""
        from muse.core.types import split_id

        root = _repo(tmp_path)
        r_algo, r_hex = split_id(_REPO_ID)
        rel_dir = releases_dir(root) / r_algo / r_hex
        rel_dir.mkdir(parents=True)
        # One byte over the patched 100-byte limit.
        oversized_release = rel_dir / f"{'a' * 64}.msgpack"
        oversized_release.write_bytes(b"\x00" * 101)
        with patch("muse.core.io.MAX_MSGPACK_BYTES", 100):
            found = list_releases(root, _REPO_ID)
        assert found == [], "Oversized release should be skipped, not crash"
| 286 | |
| 287 | |
| 288 | # --------------------------------------------------------------------------- |
| 289 | # Tier 3 — exact boundary behaviour |
| 290 | # --------------------------------------------------------------------------- |
| 291 | |
class TestExactBoundary:
    """Boundary behaviour: ``MAX_MSGPACK_BYTES`` itself is the last allowed size."""

    def test_file_exactly_at_limit_is_read(self, tmp_path: pathlib.Path) -> None:
        """A file of exactly the (patched) limit must not trip the size guard.

        The content is not a valid record, so an error is still expected —
        but if it is an ``OSError`` it must not be the size-limit one.
        """
        from muse.core.io import _read_msgpack

        test_limit = 256  # small limit for test speed
        target = tmp_path / "exactly_at_limit.msgpack"
        target.write_bytes(b"\x00" * test_limit)
        with patch("muse.core.io.MAX_MSGPACK_BYTES", test_limit):
            try:
                _read_msgpack(target)
            except OSError as exc:
                assert "MiB read limit" not in str(exc), (
                    f"Got size-limit OSError at the boundary — should be parse error: {exc}"
                )
            except Exception:
                pass  # Any non-size-limit error is acceptable here
            else:
                # Reached only when the read succeeded; raised outside the handlers above.
                pytest.fail("Expected an error for invalid msgpack content")

    def test_file_one_byte_over_limit_raises_oslimit_error(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A file one byte over the (patched) limit raises the size-limit OSError."""
        from muse.core.io import _read_msgpack

        test_limit = 256
        target = tmp_path / "one_over.msgpack"
        target.write_bytes(b"\x00" * (test_limit + 1))
        with patch("muse.core.io.MAX_MSGPACK_BYTES", test_limit), \
                pytest.raises(OSError, match="read limit"):
            _read_msgpack(target)

    def test_zero_byte_file_does_not_trigger_size_limit(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Reading an empty file raises some exception (expected: a parse error)."""
        from muse.core.io import _read_msgpack

        target = tmp_path / "empty.msgpack"
        target.write_bytes(b"")
        with pytest.raises(Exception):  # parse error, not size error
            _read_msgpack(target)

    def test_size_limit_error_message_includes_filename_and_limit(
        self, tmp_path: pathlib.Path
    ) -> None:
        """The size-limit OSError names the file and mentions a size unit."""
        from muse.core.io import _read_msgpack

        test_limit = 1024  # 1 KiB for test speed
        target = tmp_path / "big.msgpack"
        target.write_bytes(b"\x00" * (test_limit + 1))
        with patch("muse.core.io.MAX_MSGPACK_BYTES", test_limit), \
                pytest.raises(OSError) as exc_info:
            _read_msgpack(target)
        msg = str(exc_info.value)
        assert "big.msgpack" in msg, f"Filename missing from error: {msg}"
        assert any(unit in msg for unit in ("KiB", "MiB", "bytes")), (
            f"Size info missing from error: {msg}"
        )
| 355 | |
| 356 | |
| 357 | # --------------------------------------------------------------------------- |
| 358 | # Tier 4 — per-value unpack limits |
| 359 | # --------------------------------------------------------------------------- |
| 360 | |
class TestPerValueUnpackLimits:
    """Checks that ``_read_msgpack`` rejects individual values over its limits."""

    def _pack_to_path(self, tmp_path: pathlib.Path, data: MsgpackValue) -> pathlib.Path:
        """Serialise *data* with msgpack into ``tmp_path/test.msgpack`` and return the path."""
        target = tmp_path / "test.msgpack"
        payload = msgpack.packb(data, use_bin_type=True)
        target.write_bytes(payload)
        return target

    def test_string_exceeding_max_str_len_rejected(self, tmp_path: pathlib.Path) -> None:
        """A 200-char string is rejected when the string limit is patched to 100."""
        from muse.core.io import _read_msgpack

        target = self._pack_to_path(tmp_path, {"key": "x" * 200})
        with patch("muse.core.io._MSGPACK_MAX_STR_LEN", 100), pytest.raises(Exception):
            _read_msgpack(target)

    def test_string_within_max_str_len_accepted(self, tmp_path: pathlib.Path) -> None:
        """A short string value unpacks into a dict."""
        from muse.core.io import _read_msgpack

        target = self._pack_to_path(tmp_path, {"key": "short"})
        assert isinstance(_read_msgpack(target), dict)

    def test_binary_blob_rejected_in_store_records(self, tmp_path: pathlib.Path) -> None:
        """A document containing a msgpack ``bin`` value must be rejected.

        Commit/snapshot/tag records contain no binary fields, so a file with
        binary data is either corrupt or tampered. The reader is expected to
        be configured with ``max_bin_len=0`` so that any bin-type value fails
        during unpack instead of yielding a ``bytes`` value callers are not
        prepared to handle.
        """
        from muse.core.io import _read_msgpack

        target = self._pack_to_path(tmp_path, {"body": b"some binary blob"})
        with pytest.raises(Exception):
            _read_msgpack(target)

    def test_map_exceeding_max_map_len_rejected(self, tmp_path: pathlib.Path) -> None:
        """A 200-entry map is rejected when the map limit is patched to 100."""
        from muse.core.io import _read_msgpack

        oversized_map: MsgpackDict = {str(n): n for n in range(200)}
        target = self._pack_to_path(tmp_path, oversized_map)
        with patch("muse.core.io._MSGPACK_MAX_MAP_LEN", 100), pytest.raises(Exception):
            _read_msgpack(target)

    def test_array_exceeding_max_array_len_rejected(self, tmp_path: pathlib.Path) -> None:
        """A 200-element array is rejected when the array limit is patched to 100."""
        from muse.core.io import _read_msgpack

        oversized_list: list[MsgpackValue] = list(range(200))
        target = self._pack_to_path(tmp_path, oversized_list)
        with patch("muse.core.io._MSGPACK_MAX_ARRAY_LEN", 100), pytest.raises(Exception):
            _read_msgpack(target)

    def _make_deep_nested_msgpack(self, depth: int) -> bytes:
        """Hand-assemble msgpack bytes for a dict nested *depth* levels deep.

        The bytes are built directly instead of via ``msgpack.packb`` so no
        deeply nested Python object has to be constructed first. The layout
        is::

            fixmap(1) fixstr("x") fixmap(1) fixstr("x") ... fixmap(0)

        Each level contributes three bytes — ``0x81`` (fixmap with one entry)
        followed by ``0xa1 0x78`` (fixstr "x") — and the innermost value is
        ``0x80`` (an empty fixmap).
        """
        one_level = b"\x81\xa1x"  # fixmap(1) + fixstr "x"
        innermost = b"\x80"  # fixmap(0)
        return b"".join((one_level * depth, innermost))

    def test_deeply_nested_map_raises_stack_error(self, tmp_path: pathlib.Path) -> None:
        """A 10 000-level nested document makes ``_read_msgpack`` raise.

        The file is only ~30 KiB, so the size check passes; the rejection is
        expected to come from the unpacker's nesting guard, not the 64 MiB cap.
        """
        from muse.core.io import _read_msgpack

        target = tmp_path / "deep_nest.msgpack"
        target.write_bytes(self._make_deep_nested_msgpack(10_000))
        with pytest.raises(Exception):  # msgpack.exceptions.StackError
            _read_msgpack(target)

    def test_deeply_nested_terminates_quickly(self, tmp_path: pathlib.Path) -> None:
        """Reading the deeply nested document finishes (or fails) in under 1 second."""
        from muse.core.io import _read_msgpack

        target = tmp_path / "deep_nest_perf.msgpack"
        target.write_bytes(self._make_deep_nested_msgpack(10_000))
        began = time.perf_counter()
        try:
            _read_msgpack(target)
        except Exception:
            pass  # only the elapsed time matters here
        elapsed = time.perf_counter() - began
        assert elapsed < 1.0, (
            f"Deeply nested document took {elapsed:.3f}s to fail — not fast enough"
        )

    def test_valid_large_map_within_limits_is_accepted(self, tmp_path: pathlib.Path) -> None:
        """A 1000-entry ``{path: object_id}`` map unpacks cleanly."""
        from muse.core.io import _read_msgpack

        # Simulate a 1000-file snapshot manifest: {path: object_id}
        manifest = {f"src/file_{i:04d}.py": fake_id(f"obj-{i}") for i in range(1000)}
        target = tmp_path / "big_valid.msgpack"
        target.write_bytes(msgpack.packb(manifest, use_bin_type=True))
        loaded = _read_msgpack(target)
        assert isinstance(loaded, dict)
        assert len(loaded) == 1000
| 477 | |
| 478 | |
| 479 | # --------------------------------------------------------------------------- |
| 480 | # Tier 5 — index file protection |
| 481 | # --------------------------------------------------------------------------- |
| 482 | |
class TestIndexReadProtection:
    """Size-guard checks for the index loaders in ``muse.core.indices``.

    The index limit (``_MAX_INDEX_BYTES``) is patched down to 100 bytes where
    a real oversized file is needed.
    """

    def test_load_symbol_history_skips_oversized_index(
        self, tmp_path: pathlib.Path
    ) -> None:
        """An oversized symbol history index yields an empty dict."""
        index_dir = indices_dir(tmp_path)
        index_dir.mkdir(parents=True)
        (index_dir / "symbol_history.msgpack").write_bytes(b"\x00" * 101)
        with patch("muse.core.indices._MAX_INDEX_BYTES", 100):
            result = load_symbol_history(tmp_path)
        assert result == {}, "Oversized index must return empty dict, not crash"

    def test_load_hash_occurrence_skips_oversized_index(
        self, tmp_path: pathlib.Path
    ) -> None:
        """An oversized hash_occurrence index yields an empty dict."""
        index_dir = indices_dir(tmp_path)
        index_dir.mkdir(parents=True)
        (index_dir / "hash_occurrence.msgpack").write_bytes(b"\x00" * 101)
        with patch("muse.core.indices._MAX_INDEX_BYTES", 100):
            result = load_hash_occurrence(tmp_path)
        assert result == {}

    def test_index_size_limit_is_more_generous_than_store(self) -> None:
        """The index limit is strictly larger than the store-record limit."""
        from muse.core.indices import _MAX_INDEX_BYTES

        assert _MAX_INDEX_BYTES > MAX_MSGPACK_BYTES, (
            "Index limit should be larger than store limit — indices grow with repo size"
        )

    def test_index_read_checks_stat_before_read_bytes(
        self, tmp_path: pathlib.Path
    ) -> None:
        """The loader must consult ``stat`` and bail out without calling ``read_bytes``.

        ``stat`` is mocked to report 1 GiB for a file that is really one byte
        long, and ``read_bytes`` is replaced with a tracker on the path class.
        """
        index_dir = indices_dir(tmp_path)
        index_dir.mkdir(parents=True)
        index_path = index_dir / "symbol_history.msgpack"
        index_path.write_bytes(b"\x85")  # 1 byte — well within any size limit

        path_cls = type(index_path)
        # Captured before patching so the tracker can delegate to the real method.
        original_read_bytes = path_cls.read_bytes
        read_calls: list[pathlib.Path] = []

        def tracking_rb(self: pathlib.Path) -> bytes:
            # Installed on the class, so it is invoked as a bound method and
            # must accept ``self``; without that parameter the call would
            # raise TypeError before the read could be recorded.
            read_calls.append(self)
            return original_read_bytes(self)

        stat_result = MagicMock()
        stat_result.st_size = 1024 * 1024 * 1024  # 1 GiB — way over limit

        with patch.object(path_cls, "stat", return_value=stat_result):
            with patch.object(path_cls, "read_bytes", tracking_rb):
                result = load_symbol_history(tmp_path)

        assert result == {}
        assert not read_calls, "read_bytes was called before the stat check!"
| 538 | |
| 539 | |
| 540 | # --------------------------------------------------------------------------- |
| 541 | # Tier 6 — warning log on oversized file |
| 542 | # --------------------------------------------------------------------------- |
| 543 | |
class TestWarningLogOnOversizedFile:
    """Operators need a log record when a bad store file is encountered.

    Both tests corrupt a stored object, read it back, and require at least
    one captured log record whose message mentions "Corrupt"/"corrupt".
    """

    def test_warning_logged_for_corrupt_commit(
        self, tmp_path: pathlib.Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """A log record mentioning corruption is emitted for a corrupt commit.

        The old msgpack size-limit guard produced WARNING; the unified object
        store produces CRITICAL for any corrupt content. Capturing at WARNING
        admits both.
        """
        root = _repo(tmp_path)
        commit = _commit(10)
        write_commit(root, commit)
        _obj_path(root, commit.commit_id).write_bytes(b"\x00" * 51)
        with caplog.at_level(logging.WARNING, logger="muse.core.store"):
            outcome = read_commit(root, commit.commit_id)
        assert outcome is None
        messages = [rec.message for rec in caplog.records]
        mentions_corruption = any(
            "Corrupt" in text or "corrupt" in text for text in messages
        )
        assert mentions_corruption, (
            f"No log for corrupt commit. Records: {[r.message for r in caplog.records]}"
        )

    def test_warning_logged_for_corrupt_snapshot(
        self, tmp_path: pathlib.Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """A log record mentioning corruption is emitted for a corrupt snapshot."""
        root = _repo(tmp_path)
        snap = _snapshot(10)
        write_snapshot(root, snap)
        _obj_path(root, snap.snapshot_id).write_bytes(b"\x00" * 51)
        with caplog.at_level(logging.WARNING, logger="muse.core.store"):
            outcome = read_snapshot(root, snap.snapshot_id)
        assert outcome is None
        messages = [rec.message for rec in caplog.records]
        mentions_corruption = any(
            "Corrupt" in text or "corrupt" in text for text in messages
        )
        assert mentions_corruption, (
            f"No log for corrupt snapshot. Records: {[r.message for r in caplog.records]}"
        )
| 586 | |
| 587 | |
| 588 | # --------------------------------------------------------------------------- |
| 589 | # Tier 7 — CLI: clean JSON error, no traceback |
| 590 | # --------------------------------------------------------------------------- |
| 591 | |
class TestPlumbingReadCommitOversized:
    """``muse read-commit`` on a bad commit file must fail cleanly.

    The command may exit non-zero, but its output must contain no Python
    traceback and must carry a machine-readable (or at least meaningful)
    error.
    """

    def test_corrupt_commit_produces_json_error_not_traceback(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Write a commit, corrupt its object file, run read-commit, inspect the output."""
        import json
        from tests.cli_test_helper import CliRunner

        root = _repo(tmp_path)
        c = _commit(99)
        write_commit(root, c)

        # Corrupt the commit object file (unified store).
        _obj_path(root, c.commit_id).write_bytes(b"\x00" * 101)

        runner = CliRunner()
        result = runner.invoke(None, ["read-commit", c.commit_id],
                               env={"MUSE_REPO_ROOT": str(root)})

        # Must not crash (exit code may be non-zero, but not a Python traceback).
        assert "Traceback" not in (result.output or ""), (
            f"CLI produced a Python traceback for oversized commit:\n{result.output}"
        )
        assert "Traceback" not in (result.stderr or ""), (
            f"CLI stderr has a Python traceback:\n{result.stderr}"
        )

        # The error output must be valid JSON (or include a meaningful error).
        combined = (result.output or "") + (result.stderr or "")
        # Only the first line that looks like a JSON object is inspected.
        first_json_line = next(
            (
                stripped
                for stripped in map(str.strip, combined.splitlines())
                if stripped.startswith("{")
            ),
            None,
        )

        if first_json_line is None:
            # No JSON line found: at minimum require a recognisable error phrase.
            lowered = combined.lower()
            assert "not found" in lowered or "error" in lowered, (
                f"No useful error in CLI output:\n{combined}"
            )
            return

        # json.loads is the only call that can raise JSONDecodeError, so the
        # try is scoped to it alone.
        try:
            parsed = json.loads(first_json_line)
        except json.JSONDecodeError as exc:
            pytest.fail(f"Output is not valid JSON: {exc}\nOutput:\n{combined}")
        else:
            assert "error" in parsed, f"JSON lacks 'error' key: {parsed}"
| 642 | |
| 643 | |
| 644 | # --------------------------------------------------------------------------- |
| 645 | # Tier 8 — round-trip: valid files still read correctly |
| 646 | # --------------------------------------------------------------------------- |
| 647 | |
class TestValidFilesUnaffected:
    """Round-trip checks: well-formed records must still read back correctly."""

    def test_read_commit_roundtrip_unaffected(self, tmp_path: pathlib.Path) -> None:
        """A written commit reads back with the same ID and message."""
        root = _repo(tmp_path)
        original = _commit(42)
        write_commit(root, original)
        loaded = read_commit(root, original.commit_id)
        assert loaded is not None
        assert loaded.commit_id == original.commit_id
        assert loaded.message == original.message

    def test_read_snapshot_roundtrip_unaffected(self, tmp_path: pathlib.Path) -> None:
        """A written snapshot reads back with the same ID."""
        root = _repo(tmp_path)
        original = _snapshot(42)
        write_snapshot(root, original)
        loaded = read_snapshot(root, original.snapshot_id)
        assert loaded is not None
        assert loaded.snapshot_id == original.snapshot_id

    def test_snapshot_with_large_manifest_reads_correctly(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A 1000-file snapshot manifest (realistic scale) reads without issue."""
        root = _repo(tmp_path)
        manifest = {
            f"src/file_{i:05d}.py": fake_id(f"obj-{i}") for i in range(1000)
        }
        sid = compute_snapshot_id(manifest)
        write_snapshot(root, SnapshotRecord(snapshot_id=sid, manifest=manifest))
        loaded = read_snapshot(root, sid)
        assert loaded is not None
        assert len(loaded.manifest) == 1000

    def test_commit_with_long_message_reads_correctly(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A commit carrying a 65536-character message reads back at full length."""
        root = _repo(tmp_path)
        message_length = 65536
        long_msg = "a" * message_length
        when = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
        empty_snapshot_id = compute_snapshot_id({})
        cid = compute_commit_id(
            parent_ids=[],
            snapshot_id=empty_snapshot_id,
            message=long_msg,
            committed_at_iso=when.isoformat(),
            author="tester",
        )
        write_commit(
            root,
            CommitRecord(
                commit_id=cid,
                branch="main",
                snapshot_id=empty_snapshot_id,
                message=long_msg,
                committed_at=when,
                author="tester",
                parent_commit_id=None,
                parent2_commit_id=None,
            ),
        )
        loaded = read_commit(root, cid)
        assert loaded is not None
        assert len(loaded.message) == 65536
| 713 | |
| 714 | |
| 715 | # --------------------------------------------------------------------------- |
| 716 | # Tier 9 — performance: size check adds < 1 ms per read |
| 717 | # --------------------------------------------------------------------------- |
| 718 | |
class TestSizeCheckPerformance:
    """Timing checks: guarded reads and rejections must stay cheap."""

    @pytest.mark.perf
    def test_stat_check_overhead_under_1ms_per_read(
        self, tmp_path: pathlib.Path
    ) -> None:
        """100 sequential read_commit calls with the size guard active < 100ms total."""
        root = _repo(tmp_path)
        commits = [_commit(i) for i in range(100)]
        for c in commits:
            write_commit(root, c)

        start = time.perf_counter()
        for c in commits:
            result = read_commit(root, c.commit_id)
            assert result is not None
        elapsed = time.perf_counter() - start

        assert elapsed < 0.1, (
            f"100 read_commit calls took {elapsed:.3f}s — "
            "size check is adding too much overhead (< 100ms expected)"
        )

    @pytest.mark.perf
    def test_oversized_rejection_under_1ms(self, tmp_path: pathlib.Path) -> None:
        """1000 rejected reads of a bad commit file complete in under one second."""
        root = _repo(tmp_path)
        c = _commit(200)
        write_commit(root, c)
        # Corrupt the same object-store file the other tests in this module
        # target, so the timed loop exercises the rejection path.
        # NOTE(review): previously this wrote to commits/<id>.msgpack —
        # confirm read_commit no longer consults that legacy location.
        _obj_path(root, c.commit_id).write_bytes(b"\x00" * 101)

        with patch("muse.core.io.MAX_MSGPACK_BYTES", 100):
            # Prove the read really is rejected before timing it.
            assert read_commit(root, c.commit_id) is None, (
                "bad commit file was not rejected — timing would measure a normal read"
            )
            start = time.perf_counter()
            for _ in range(1000):
                read_commit(root, c.commit_id)
            elapsed = time.perf_counter() - start

        assert elapsed < 1.0, (
            f"1000 oversized-rejection calls took {elapsed:.3f}s (> 1ms each)"
        )
File History
4 commits
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2
fix: remove commit_exists filter from have anchors — server…
Sonnet 4.6
patch
21 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e
fix: rename objects→blobs in push client and all stale test…
Sonnet 4.6
patch
22 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a
fix: repair four test failures from post-migration audit
Sonnet 4.6
patch
28 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf
fix: unified object store migration — idempotent writes, JS…
Sonnet 4.6
minor
⚠
29 days ago