gabriel / muse public
test_integrity_I4_msgpack_size.py python
760 lines 30.6 KB
Raw
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
1 """I-4: Store file size limit — prevent OOM from oversized store files.
2
3 Problem (pre-fix): ``_read_msgpack`` called ``path.read_bytes()`` with no
4 size guard. A 10 GiB corrupt or adversarially crafted store file would
5 allocate 10 GiB of RAM, crashing the process or triggering the OOM killer
6 — a critical data-integrity and availability failure.
7
8 ``read_object`` in the object store already had a 256 MiB cap. The commit,
9 snapshot, tag, release, shelf, and index stores did not.
10
11 Fix: added to both ``muse/core/store.py`` and ``muse/core/indices.py``:
12
13 1. ``MAX_MSGPACK_BYTES = 64 MiB`` — ``stat().st_size`` is checked *before*
14 ``read_bytes()`` so no allocation ever occurs. The constant name is
15 legacy; it also guards the new JSON/git-header store files.
16 2. Per-value limits on msgpack wire reads — ``max_str_len``,
17 ``max_bin_len``, ``max_array_len``, ``max_map_len`` — prevent deeply
18 nested or pathologically large single-value documents from consuming
19 unbounded memory even within the size cap.
20
21 This file proves every aspect of the fix:
22
  Tier 0     — constant export
  Low-level  — corrupt object files are rejected before use (OOM prevention)
  High-level — public read functions return None / skip oversized files
  Tier 3     — boundary / exact-limit behaviour
  Tier 4     — per-value unpack limits
  Tier 5     — index file protection
  Tier 6     — log record emitted for a corrupt / oversized file
  Tier 7     — CLI command (clean JSON error, no traceback)
  Tier 8     — round-trip: valid files still read correctly
  Tier 9     — performance (size check adds < 1 ms overhead)
32 """
33 from __future__ import annotations
34
35 import datetime
36 import logging
37 import pathlib
38 import time
39 from unittest.mock import patch, MagicMock
40
41 import msgpack
42 import pytest
43
44 from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id
45 from muse.core.object_store import object_path as _obj_path
46 from muse.core.io import MAX_MSGPACK_BYTES
47 from muse.core.types import MsgpackValue
48 from muse.core.commits import (
49 CommitRecord,
50 read_commit,
51 write_commit,
52 )
53 from muse.core.snapshots import (
54 SnapshotRecord,
55 read_snapshot,
56 write_snapshot,
57 )
58 from muse.core.tags import (
59 TagRecord,
60 get_all_tags,
61 write_tag,
62 )
63 from muse.core.releases import list_releases
64
65 from muse.core.types import Manifest, MsgpackDict, fake_id
66 from muse.core.indices import (
67 load_symbol_history,
68 load_hash_occurrence,
69 )
70 from muse.core.paths import commits_dir, indices_dir, muse_dir, releases_dir, snapshots_dir
71
72
73 # ---------------------------------------------------------------------------
74 # Helpers
75 # ---------------------------------------------------------------------------
76
# Repository id shared by every fixture in this module: written into
# repo.json by _repo() and passed to the tag / release helpers.
_REPO_ID = fake_id("test-repo")
78
79
def _repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Create a bare repository skeleton under *tmp_path* and return *tmp_path*.

    The skeleton consists of the store directories (commits, snapshots, tags,
    releases, indices), ``refs/heads``, a ``HEAD`` file pointing at ``main``
    and a ``repo.json`` carrying ``_REPO_ID``.
    """
    dot_muse = muse_dir(tmp_path)
    # "commits" is created first with parents=True so that the muse
    # directory itself comes into existence along the way.
    (dot_muse / "commits").mkdir(parents=True)
    for store_name in ("snapshots", "tags", "releases", "indices"):
        (dot_muse / store_name).mkdir()
    heads_dir = dot_muse / "refs" / "heads"
    heads_dir.mkdir(parents=True)
    (dot_muse / "HEAD").write_text("ref: refs/heads/main\n")
    (dot_muse / "repo.json").write_text(f'{{"repo_id": "{_REPO_ID}"}}\n')
    return tmp_path
91
92
def _commit(idx: int = 0) -> CommitRecord:
    """Return a parentless ``CommitRecord`` on ``main`` with message ``commit {idx}``.

    The commit id is derived from the same fields that are stored on the
    record; *idx* only influences the message (and therefore the id).
    """
    author = "tester"
    text = f"commit {idx}"
    when = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
    empty_snapshot_id = compute_snapshot_id({})
    derived_id = compute_commit_id(
        parent_ids=[],
        snapshot_id=empty_snapshot_id,
        message=text,
        committed_at_iso=when.isoformat(),
        author=author,
    )
    return CommitRecord(
        commit_id=derived_id,
        branch="main",
        snapshot_id=empty_snapshot_id,
        message=text,
        committed_at=when,
        author=author,
        parent_commit_id=None,
        parent2_commit_id=None,
    )
114
115
def _snapshot(idx: int = 0) -> SnapshotRecord:
    """Return a ``SnapshotRecord`` whose one-entry manifest depends on *idx*.

    The manifest key is the constant ``"__idx__"``; only the mapped object id
    (``fake_id(f"snap-{idx}")``) varies, which is enough to give each *idx* a
    distinct snapshot id.
    """
    # Plain literal: the key has no placeholders, so no f-string is needed.
    manifest: Manifest = {"__idx__": fake_id(f"snap-{idx}")}
    sid = compute_snapshot_id(manifest)
    return SnapshotRecord(
        snapshot_id=sid,
        manifest=manifest,
    )
123
124
def _tag(idx: int = 0) -> TagRecord:
    """Return a ``TagRecord`` named ``v{idx}.0.0`` belonging to ``_REPO_ID``."""
    tag_identifier = fake_id(f"tag-id-{idx}")
    tagged_commit = fake_id(f"tag-commit-{idx}")
    label = f"v{idx}.0.0"
    return TagRecord(
        repo_id=_REPO_ID,
        tag_id=tag_identifier,
        commit_id=tagged_commit,
        tag=label,
    )
132
133
134 # ---------------------------------------------------------------------------
135 # Tier 0 — constant export
136 # ---------------------------------------------------------------------------
137
class TestConstantExport:
    """Checks on the ``MAX_MSGPACK_BYTES`` constant exposed by ``muse.core.io``.

    The name is a holdover from before the JSON migration; per the original
    notes the same cap also guards the git-header+JSON store files and the
    legacy shelf .msgpack files.
    """

    def test_max_msgpack_bytes_is_exported(self) -> None:
        """The constant is importable from ``muse.core.io`` and equals 64 MiB."""
        from muse.core.io import MAX_MSGPACK_BYTES as exported

        sixty_four_mib = 64 * 1024 * 1024
        assert exported == sixty_four_mib, (
            f"Expected 64 MiB (67108864), got {exported}"
        )

    def test_max_msgpack_bytes_is_int(self) -> None:
        """The cap is an ``int`` instance."""
        assert isinstance(MAX_MSGPACK_BYTES, int)

    def test_max_msgpack_bytes_less_than_256mib(self) -> None:
        """Store records should be capped well below 256 MiB."""
        object_store_cap = 256 * 1024 * 1024
        assert MAX_MSGPACK_BYTES < object_store_cap, (
            "Store records should be capped below the object store's 256 MiB limit"
        )
159
160
161 # ---------------------------------------------------------------------------
162 # Low-level — stat check fires BEFORE read_bytes (the OOM prevention)
163 # ---------------------------------------------------------------------------
164
class TestStatCheckBeforeRead:
    """Corrupt object files must make the record readers return ``None``.

    Each test writes a valid record, overwrites its object-store file with
    bytes that carry no valid header, and expects the reader to report
    ``None``. The ``_oversized_stat`` helper builds a fake stat result but
    is not used by the tests currently in this class.
    """

    def _oversized_stat(self, real_path: pathlib.Path) -> MagicMock:
        """Return a MagicMock that reports st_size = MAX_MSGPACK_BYTES + 1.

        *real_path* is accepted but not consulted.
        """
        return MagicMock(st_size=MAX_MSGPACK_BYTES + 1)

    def test_read_commit_corrupt_object_returns_none(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A commit whose object file holds garbage reads back as ``None``.

        The stat-before-read guard belonged to the old msgpack store; with the
        unified object store any unreadable content is expected to fail
        gracefully instead.
        """
        repo_root = _repo(tmp_path)
        record = _commit(0)
        write_commit(repo_root, record)
        # Replace the stored object with bytes that have no valid header.
        object_file = _obj_path(repo_root, record.commit_id)
        object_file.write_bytes(b"not-valid-content")
        loaded = read_commit(repo_root, record.commit_id)
        assert loaded is None, "read_commit must return None for corrupt object"

    def test_read_snapshot_corrupt_object_returns_none(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A snapshot whose object file holds garbage reads back as ``None``."""
        repo_root = _repo(tmp_path)
        record = _snapshot(0)
        write_snapshot(repo_root, record)
        object_file = _obj_path(repo_root, record.snapshot_id)
        object_file.write_bytes(b"not-valid-content")
        loaded = read_snapshot(repo_root, record.snapshot_id)
        assert loaded is None
205
206
207 # ---------------------------------------------------------------------------
208 # High-level — high-level read functions return None for oversized files
209 # ---------------------------------------------------------------------------
210
class TestReadFunctionsReturnNoneOnOversize:
    """Public read functions must cope with corrupt or oversized files.

    Where a size limit is involved, ``muse.core.io.MAX_MSGPACK_BYTES`` is
    patched down to a small value so that real files can exceed it without
    writing gigabytes to disk.
    """

    def test_read_commit_returns_none_for_corrupt_object(
        self, tmp_path: pathlib.Path
    ) -> None:
        """read_commit returns None (rather than raising) for a corrupt object.

        The old msgpack size limit is superseded by the unified object store,
        where any corrupt content is expected to fail gracefully.
        """
        repo_root = _repo(tmp_path)
        record = _commit(1)
        write_commit(repo_root, record)
        # 200 NUL bytes: no valid muse object header.
        garbage = b"\x00" * 200
        _obj_path(repo_root, record.commit_id).write_bytes(garbage)
        loaded = read_commit(repo_root, record.commit_id)
        assert loaded is None, "read_commit must return None, not raise, for corrupt object"

    def test_read_snapshot_returns_none_for_corrupt_object(
        self, tmp_path: pathlib.Path
    ) -> None:
        """read_snapshot returns None for a corrupt object."""
        repo_root = _repo(tmp_path)
        record = _snapshot(1)
        write_snapshot(repo_root, record)
        garbage = b"\x00" * 200
        _obj_path(repo_root, record.snapshot_id).write_bytes(garbage)
        loaded = read_snapshot(repo_root, record.snapshot_id)
        assert loaded is None

    def test_get_all_tags_skips_oversized_files(
        self, tmp_path: pathlib.Path
    ) -> None:
        """get_all_tags keeps the normal tag and drops the inflated one."""
        repo_root = _repo(tmp_path)
        keeper = _tag(0)
        inflated = _tag(1)
        for record in (keeper, inflated):
            write_tag(repo_root, record)

        from muse.core.tags import tag_path

        # The limit is derived from the size of a real tag file: twice that
        # size admits the good tag, while the bad file is blown up to three
        # times that size and so exceeds the limit.
        baseline = tag_path(repo_root, _REPO_ID, keeper.tag_id).stat().st_size
        limit_for_test = baseline * 2
        inflated_file = tag_path(repo_root, _REPO_ID, inflated.tag_id)
        inflated_file.write_bytes(b"\x00" * (baseline * 3))

        with patch("muse.core.io.MAX_MSGPACK_BYTES", limit_for_test):
            listed = get_all_tags(repo_root, _REPO_ID)
        seen_ids = {entry.tag_id for entry in listed}
        assert keeper.tag_id in seen_ids, "Good tag was incorrectly dropped"
        assert inflated.tag_id not in seen_ids, "Oversized tag was not skipped"

    def test_list_releases_skips_oversized_files(
        self, tmp_path: pathlib.Path
    ) -> None:
        """list_releases must skip oversized release files."""
        repo_root = _repo(tmp_path)
        from muse.core.types import split_id

        algo, digest = split_id(_REPO_ID)
        release_dir = releases_dir(repo_root) / algo / digest
        release_dir.mkdir(parents=True)
        # One byte more than the patched limit below.
        oversized = release_dir / f"{'a' * 64}.msgpack"
        oversized.write_bytes(b"\x00" * 101)
        with patch("muse.core.io.MAX_MSGPACK_BYTES", 100):
            listed = list_releases(repo_root, _REPO_ID)
        assert listed == [], "Oversized release should be skipped, not crash"
286
287
288 # ---------------------------------------------------------------------------
289 # Tier 3 — exact boundary behaviour
290 # ---------------------------------------------------------------------------
291
class TestExactBoundary:
    """At the boundary: MAX_MSGPACK_BYTES is the last allowed size.

    All tests call ``muse.core.io._read_msgpack`` directly on files written
    into ``tmp_path``; the limit is patched to a small value for speed.
    The size-limit error is recognised by the phrase ``"read limit"``, the
    same phrase the over-limit test matches.
    """

    def test_file_exactly_at_limit_is_read(self, tmp_path: pathlib.Path) -> None:
        """A file of exactly MAX_MSGPACK_BYTES bytes passes the size check.

        The content (a run of NUL bytes) is not a single valid document, so an
        error is still expected — but it must not be the size-limit error.
        """
        test_limit = 256  # small limit for test speed
        path = tmp_path / "exactly_at_limit.msgpack"
        path.write_bytes(b"\x00" * test_limit)
        with patch("muse.core.io.MAX_MSGPACK_BYTES", test_limit):
            from muse.core.io import _read_msgpack
            try:
                _read_msgpack(path)
            except Exception as exc:
                # Match on "read limit" (not "MiB read limit") so the check
                # still bites if the message reports the limit in KiB/bytes.
                assert "read limit" not in str(exc), (
                    f"Got size-limit OSError at the boundary — should be parse error: {exc}"
                )
            else:
                pytest.fail("Expected an error for invalid msgpack content")

    def test_file_one_byte_over_limit_raises_oslimit_error(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A file of MAX_MSGPACK_BYTES + 1 bytes raises the size-limit OSError."""
        test_limit = 256
        path = tmp_path / "one_over.msgpack"
        path.write_bytes(b"\x00" * (test_limit + 1))
        with patch("muse.core.io.MAX_MSGPACK_BYTES", test_limit):
            from muse.core.io import _read_msgpack
            with pytest.raises(OSError, match="read limit"):
                _read_msgpack(path)

    def test_zero_byte_file_does_not_trigger_size_limit(
        self, tmp_path: pathlib.Path
    ) -> None:
        """An empty file passes the size check but fails msgpack parse."""
        path = tmp_path / "empty.msgpack"
        path.write_bytes(b"")
        from muse.core.io import _read_msgpack
        with pytest.raises(Exception) as exc_info:
            _read_msgpack(path)
        # The test name promises the failure is NOT the size limit: verify it.
        assert "read limit" not in str(exc_info.value), (
            f"Empty file triggered the size-limit error: {exc_info.value}"
        )

    def test_size_limit_error_message_includes_filename_and_limit(
        self, tmp_path: pathlib.Path
    ) -> None:
        """The OSError message must name the file and mention a size unit."""
        test_limit = 1024  # 1 KiB for test speed
        path = tmp_path / "big.msgpack"
        path.write_bytes(b"\x00" * (test_limit + 1))
        with patch("muse.core.io.MAX_MSGPACK_BYTES", test_limit):
            from muse.core.io import _read_msgpack
            with pytest.raises(OSError) as exc_info:
                _read_msgpack(path)
        msg = str(exc_info.value)
        assert "big.msgpack" in msg, f"Filename missing from error: {msg}"
        assert "KiB" in msg or "MiB" in msg or "bytes" in msg, (
            f"Size info missing from error: {msg}"
        )
355
356
357 # ---------------------------------------------------------------------------
358 # Tier 4 — per-value unpack limits
359 # ---------------------------------------------------------------------------
360
class TestPerValueUnpackLimits:
    """Feed ``_read_msgpack`` documents that break (or respect) per-value limits.

    Limits are lowered by patching the ``_MSGPACK_MAX_*`` names in
    ``muse.core.io`` so that small fixtures are enough to cross them.
    """

    def _pack_to_path(self, tmp_path: pathlib.Path, data: MsgpackValue) -> pathlib.Path:
        """Serialise *data* with msgpack into ``tmp_path/test.msgpack`` and return the path."""
        target = tmp_path / "test.msgpack"
        encoded = msgpack.packb(data, use_bin_type=True)
        target.write_bytes(encoded)
        return target

    def test_string_exceeding_max_str_len_rejected(self, tmp_path: pathlib.Path) -> None:
        """A string longer than _MSGPACK_MAX_STR_LEN must raise an exception."""
        # 200 characters against a patched limit of 100.
        path = self._pack_to_path(tmp_path, {"key": "x" * 200})
        from muse.core.io import _read_msgpack
        with patch("muse.core.io._MSGPACK_MAX_STR_LEN", 100), pytest.raises(Exception):
            _read_msgpack(path)

    def test_string_within_max_str_len_accepted(self, tmp_path: pathlib.Path) -> None:
        """A string within the limit unpacks normally."""
        path = self._pack_to_path(tmp_path, {"key": "short"})
        from muse.core.io import _read_msgpack
        loaded = _read_msgpack(path)
        assert isinstance(loaded, dict)

    def test_binary_blob_rejected_in_store_records(self, tmp_path: pathlib.Path) -> None:
        """Binary data (msgpack bin type) must be rejected for store records.

        Commit/snapshot/tag records are expected to carry no binary fields, so
        a file containing a bin value is treated as corrupt or tampered. Per
        the original notes the reader uses ``max_bin_len=0``, which makes any
        bin value fail during unpack instead of yielding a ``bytes`` object
        that callers are not prepared to handle.
        """
        path = self._pack_to_path(tmp_path, {"body": b"some binary blob"})
        from muse.core.io import _read_msgpack
        # No patching here: the default configuration must already reject it.
        with pytest.raises(Exception):
            _read_msgpack(path)

    def test_map_exceeding_max_map_len_rejected(self, tmp_path: pathlib.Path) -> None:
        """A map with more than _MSGPACK_MAX_MAP_LEN entries must raise."""
        oversized_map: MsgpackDict = {str(n): n for n in range(200)}
        path = self._pack_to_path(tmp_path, oversized_map)
        from muse.core.io import _read_msgpack
        with patch("muse.core.io._MSGPACK_MAX_MAP_LEN", 100), pytest.raises(Exception):
            _read_msgpack(path)

    def test_array_exceeding_max_array_len_rejected(self, tmp_path: pathlib.Path) -> None:
        """An array with more than _MSGPACK_MAX_ARRAY_LEN entries must raise."""
        oversized_list: list[MsgpackValue] = [*range(200)]
        path = self._pack_to_path(tmp_path, oversized_list)
        from muse.core.io import _read_msgpack
        with patch("muse.core.io._MSGPACK_MAX_ARRAY_LEN", 100), pytest.raises(Exception):
            _read_msgpack(path)

    def _make_deep_nested_msgpack(self, depth: int) -> bytes:
        """Return msgpack bytes for a *depth*-deep nested dict, built by hand.

        The bytes are assembled directly instead of calling ``msgpack.packb``
        on a nested dict (the original notes cite Python's recursion limit as
        the reason). Layout::

            fixmap(1) fixstr("x") fixmap(1) fixstr("x") ... fixmap(0)

        Every level contributes three bytes — ``0x81`` (fixmap, one entry)
        followed by ``0xa1 0x78`` (fixstr "x") — and the innermost value is
        ``0x80`` (empty fixmap).
        """
        one_level = b"\x81\xa1x"  # fixmap(1) + fixstr "x"
        innermost = b"\x80"  # fixmap(0)
        return (one_level * depth) + innermost

    def test_deeply_nested_map_raises_stack_error(self, tmp_path: pathlib.Path) -> None:
        """A 10 000-level nested document must make ``_read_msgpack`` raise.

        The file is only about 30 KiB (3 bytes per level), so the size cap is
        not what stops it; the original notes attribute the failure to
        msgpack's own stack guard.
        """
        path = tmp_path / "deep_nest.msgpack"
        path.write_bytes(self._make_deep_nested_msgpack(10_000))
        from muse.core.io import _read_msgpack
        with pytest.raises(Exception):  # msgpack.exceptions.StackError
            _read_msgpack(path)

    def test_deeply_nested_terminates_quickly(self, tmp_path: pathlib.Path) -> None:
        """Reading the deeply nested document finishes in under one second."""
        path = tmp_path / "deep_nest_perf.msgpack"
        path.write_bytes(self._make_deep_nested_msgpack(10_000))
        from muse.core.io import _read_msgpack
        began = time.perf_counter()
        try:
            _read_msgpack(path)
        except Exception:
            # Only the elapsed time matters here; the failure itself is
            # asserted by the sibling test.
            pass
        elapsed = time.perf_counter() - began
        assert elapsed < 1.0, (
            f"Deeply nested document took {elapsed:.3f}s to fail — not fast enough"
        )

    def test_valid_large_map_within_limits_is_accepted(self, tmp_path: pathlib.Path) -> None:
        """A large but within-limit map (simulating a 1k-file snapshot) unpacks cleanly."""
        # Shape of a snapshot manifest: {path: object_id}, 1000 entries.
        manifest = {
            f"src/file_{n:04d}.py": fake_id(f"obj-{n}") for n in range(1000)
        }
        path = tmp_path / "big_valid.msgpack"
        path.write_bytes(msgpack.packb(manifest, use_bin_type=True))
        from muse.core.io import _read_msgpack
        loaded = _read_msgpack(path)
        assert isinstance(loaded, dict)
        assert len(loaded) == 1000
477
478
479 # ---------------------------------------------------------------------------
480 # Tier 5 — index file protection
481 # ---------------------------------------------------------------------------
482
class TestIndexReadProtection:
    """Index loaders in ``muse.core.indices`` must apply their own size limit.

    The limit lives in ``muse.core.indices._MAX_INDEX_BYTES`` and is patched
    to a tiny value where a real oversized file is needed.
    """

    def test_load_symbol_history_skips_oversized_index(
        self, tmp_path: pathlib.Path
    ) -> None:
        """An oversized symbol history index returns an empty dict, not OOM."""
        (indices_dir(tmp_path)).mkdir(parents=True)
        index_path = indices_dir(tmp_path) / "symbol_history.msgpack"
        index_path.write_bytes(b"\x00" * 101)
        with patch("muse.core.indices._MAX_INDEX_BYTES", 100):
            result = load_symbol_history(tmp_path)
        assert result == {}, "Oversized index must return empty dict, not crash"

    def test_load_hash_occurrence_skips_oversized_index(
        self, tmp_path: pathlib.Path
    ) -> None:
        """An oversized hash_occurrence index returns an empty dict."""
        (indices_dir(tmp_path)).mkdir(parents=True)
        index_path = indices_dir(tmp_path) / "hash_occurrence.msgpack"
        index_path.write_bytes(b"\x00" * 101)
        with patch("muse.core.indices._MAX_INDEX_BYTES", 100):
            result = load_hash_occurrence(tmp_path)
        assert result == {}

    def test_index_size_limit_is_more_generous_than_store(self) -> None:
        """Index files are allowed to be larger than store records."""
        from muse.core.indices import _MAX_INDEX_BYTES
        assert _MAX_INDEX_BYTES > MAX_MSGPACK_BYTES, (
            "Index limit should be larger than store limit — indices grow with repo size"
        )

    def test_index_read_checks_stat_before_read_bytes(
        self, tmp_path: pathlib.Path
    ) -> None:
        """The index stat check must fire before read_bytes (no allocation).

        ``stat`` is patched to report a 1 GiB file while the real file is one
        byte long; ``read_bytes`` is replaced by a tracker that records every
        call. The loader must return ``{}`` without the tracker ever firing.
        """
        (indices_dir(tmp_path)).mkdir(parents=True)
        index_path = indices_dir(tmp_path) / "symbol_history.msgpack"
        index_path.write_bytes(b"\x85")  # 1 byte — well within any size limit
        path_cls = type(index_path)
        read_bytes_calls: list[pathlib.Path] = []
        # Grab the unpatched function so the tracker can delegate to it.
        original_read_bytes = path_cls.read_bytes

        def tracking_rb(path_self: pathlib.Path) -> bytes:
            # Patched onto the class, so it is invoked as a method and MUST
            # accept the instance; a zero-argument function would raise
            # TypeError before recording the call, hiding a real regression
            # if the loader swallows exceptions.
            read_bytes_calls.append(path_self)
            return original_read_bytes(path_self)

        stat_result = MagicMock()
        stat_result.st_size = 1024 * 1024 * 1024  # 1 GiB — way over limit

        with patch.object(path_cls, "stat", return_value=stat_result):
            with patch.object(path_cls, "read_bytes", tracking_rb):
                result = load_symbol_history(tmp_path)

        assert result == {}
        assert not read_bytes_calls, "read_bytes was called before the stat check!"
538
539
540 # ---------------------------------------------------------------------------
541 # Tier 6 — warning log on oversized file
542 # ---------------------------------------------------------------------------
543
class TestWarningLogOnOversizedFile:
    """Operators need to know when a corrupt or oversized file is detected.

    Both tests corrupt a stored object, read it back, and require that at
    least one captured log record mentions "Corrupt"/"corrupt". Capture is
    enabled from WARNING upwards; the original notes say the unified object
    store reports this at CRITICAL, whereas the old size guard used WARNING.
    """

    def test_warning_logged_for_corrupt_commit(
        self, tmp_path: pathlib.Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """A log record mentioning corruption is emitted for a corrupt commit."""
        repo_root = _repo(tmp_path)
        record = _commit(10)
        write_commit(repo_root, record)
        _obj_path(repo_root, record.commit_id).write_bytes(b"\x00" * 51)
        with caplog.at_level(logging.WARNING, logger="muse.core.store"):
            loaded = read_commit(repo_root, record.commit_id)
        assert loaded is None
        messages = [entry.message for entry in caplog.records]
        mentions_corruption = any(
            "Corrupt" in text or "corrupt" in text for text in messages
        )
        assert mentions_corruption, f"No log for corrupt commit. Records: {messages}"

    def test_warning_logged_for_corrupt_snapshot(
        self, tmp_path: pathlib.Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """A log record mentioning corruption is emitted for a corrupt snapshot."""
        repo_root = _repo(tmp_path)
        record = _snapshot(10)
        write_snapshot(repo_root, record)
        _obj_path(repo_root, record.snapshot_id).write_bytes(b"\x00" * 51)
        with caplog.at_level(logging.WARNING, logger="muse.core.store"):
            loaded = read_snapshot(repo_root, record.snapshot_id)
        assert loaded is None
        messages = [entry.message for entry in caplog.records]
        mentions_corruption = any(
            "Corrupt" in text or "corrupt" in text for text in messages
        )
        assert mentions_corruption, f"No log for corrupt snapshot. Records: {messages}"
586
587
588 # ---------------------------------------------------------------------------
589 # Tier 7 — CLI: clean JSON error, no traceback
590 # ---------------------------------------------------------------------------
591
class TestPlumbingReadCommitOversized:
    """``muse read-commit`` on a corrupt commit object must fail cleanly.

    "Cleanly" means a machine-readable JSON error (or at least a meaningful
    error message) — no Python traceback, no process crash.
    """

    def test_corrupt_commit_produces_json_error_not_traceback(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Write a commit, corrupt its object file, run read-commit, expect a clean error.

        Accepted outcomes: the first output line starting with ``{`` parses as
        JSON and has an ``"error"`` key, or — when no such line exists — the
        combined output contains "not found" or "error".
        """
        import json
        from tests.cli_test_helper import CliRunner

        root = _repo(tmp_path)
        c = _commit(99)
        write_commit(root, c)

        # Corrupt the commit object file (unified store).
        _obj_path(root, c.commit_id).write_bytes(b"\x00" * 101)

        runner = CliRunner()
        result = runner.invoke(None, ["read-commit", c.commit_id],
                               env={"MUSE_REPO_ROOT": str(root)})

        # Must not crash (exit code may be non-zero, but not a Python traceback).
        assert "Traceback" not in (result.output or ""), (
            f"CLI produced a Python traceback for oversized commit:\n{result.output}"
        )
        assert "Traceback" not in (result.stderr or ""), (
            f"CLI stderr has a Python traceback:\n{result.stderr}"
        )

        combined = (result.output or "") + (result.stderr or "")
        stripped_lines = (raw.strip() for raw in combined.splitlines())
        first_json_line = next(
            (candidate for candidate in stripped_lines if candidate.startswith("{")),
            None,
        )

        if first_json_line is None:
            # No JSON line at all: settle for a recognisable error message.
            lowered = combined.lower()
            assert "not found" in lowered or "error" in lowered, (
                f"No useful error in CLI output:\n{combined}"
            )
            return

        # Only the parse itself can raise JSONDecodeError, so only it is guarded.
        try:
            parsed = json.loads(first_json_line)
        except json.JSONDecodeError as exc:
            pytest.fail(f"Output is not valid JSON: {exc}\nOutput:\n{combined}")
        assert "error" in parsed, f"JSON lacks 'error' key: {parsed}"
642
643
644 # ---------------------------------------------------------------------------
645 # Tier 8 — round-trip: valid files still read correctly
646 # ---------------------------------------------------------------------------
647
class TestValidFilesUnaffected:
    """Well-formed records must still round-trip through write/read."""

    def test_read_commit_roundtrip_unaffected(self, tmp_path: pathlib.Path) -> None:
        """A written commit reads back with the same id and message."""
        repo_root = _repo(tmp_path)
        original = _commit(42)
        write_commit(repo_root, original)
        loaded = read_commit(repo_root, original.commit_id)
        assert loaded is not None
        assert loaded.commit_id == original.commit_id
        assert loaded.message == original.message

    def test_read_snapshot_roundtrip_unaffected(self, tmp_path: pathlib.Path) -> None:
        """A written snapshot reads back with the same id."""
        repo_root = _repo(tmp_path)
        original = _snapshot(42)
        write_snapshot(repo_root, original)
        loaded = read_snapshot(repo_root, original.snapshot_id)
        assert loaded is not None
        assert loaded.snapshot_id == original.snapshot_id

    def test_snapshot_with_large_manifest_reads_correctly(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A 1000-file snapshot manifest (realistic scale) reads without issue."""
        repo_root = _repo(tmp_path)
        big_manifest = {
            f"src/file_{n:05d}.py": fake_id(f"obj-{n}") for n in range(1000)
        }
        snapshot_id = compute_snapshot_id(big_manifest)
        write_snapshot(
            repo_root,
            SnapshotRecord(snapshot_id=snapshot_id, manifest=big_manifest),
        )
        loaded = read_snapshot(repo_root, snapshot_id)
        assert loaded is not None
        assert len(loaded.manifest) == 1000

    def test_commit_with_long_message_reads_correctly(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A commit with a 64 KiB message reads correctly (well within 1 MiB str limit)."""
        repo_root = _repo(tmp_path)
        body = "a" * 65536
        when = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc)
        empty_snapshot_id = compute_snapshot_id({})
        commit_id = compute_commit_id(
            parent_ids=[],
            snapshot_id=empty_snapshot_id,
            message=body,
            committed_at_iso=when.isoformat(),
            author="tester",
        )
        record = CommitRecord(
            commit_id=commit_id,
            branch="main",
            snapshot_id=empty_snapshot_id,
            message=body,
            committed_at=when,
            author="tester",
            parent_commit_id=None,
            parent2_commit_id=None,
        )
        write_commit(repo_root, record)
        loaded = read_commit(repo_root, commit_id)
        assert loaded is not None
        assert len(loaded.message) == 65536
713
714
715 # ---------------------------------------------------------------------------
716 # Tier 9 — performance: size check adds < 1 ms per read
717 # ---------------------------------------------------------------------------
718
class TestSizeCheckPerformance:
    """Timing budgets for normal reads and for rejecting an unreadable commit."""

    @pytest.mark.perf
    def test_stat_check_overhead_under_1ms_per_read(
        self, tmp_path: pathlib.Path
    ) -> None:
        """100 sequential read_commit calls with the size guard active < 100ms total."""
        root = _repo(tmp_path)
        commits = [_commit(i) for i in range(100)]
        for c in commits:
            write_commit(root, c)

        start = time.perf_counter()
        for c in commits:
            result = read_commit(root, c.commit_id)
            assert result is not None
        elapsed = time.perf_counter() - start

        assert elapsed < 0.1, (
            f"100 read_commit calls took {elapsed:.3f}s — "
            "size check is adding too much overhead (< 100ms expected)"
        )

    @pytest.mark.perf
    def test_oversized_rejection_under_1ms(self, tmp_path: pathlib.Path) -> None:
        """Rejecting an unreadable commit object takes < 1ms per call on average.

        The garbage is written to the object-store file (``_obj_path``), the
        same location the other tests in this module corrupt. Writing to the
        legacy ``commits/<id>.msgpack`` path would leave the real object
        intact, so the loop would time successful reads instead of rejections.
        """
        root = _repo(tmp_path)
        c = _commit(200)
        write_commit(root, c)
        _obj_path(root, c.commit_id).write_bytes(b"\x00" * 101)

        result = None
        start = time.perf_counter()
        with patch("muse.core.io.MAX_MSGPACK_BYTES", 100):
            for _ in range(1000):
                result = read_commit(root, c.commit_id)
        elapsed = time.perf_counter() - start

        # Guard against silently timing the happy path.
        assert result is None, "read_commit returned a record for a corrupt object"
        assert elapsed < 1.0, (
            f"1000 oversized-rejection calls took {elapsed:.3f}s (> 1ms each)"
        )
File History 4 commits
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 22 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 28 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 29 days ago