gabriel / muse public

test_write_commit_snapshot_hash_verify.py file-level

at sha256:d · View file ↗ · Intel ↗

History
1 files
1 commits
0 hotspots
0 🧊 dead
0 💥 blast risk
sha256:4 Merge branch 'dev' into main · gabriel · Jun 17, 2026
1 """
2 Tests for data-integrity behaviour of write_commit / write_snapshot.
3
4 === Current architecture: idempotent writes, detection at read time ===
5
6 write_commit and write_snapshot are both idempotent: if the object
7 already exists at object_path, the call returns immediately without
8 modifying disk. This means:
9
10 - Corruption that lands at object_path is NOT repaired by write_commit
11 or write_snapshot.
12 - Corruption IS detected at read time: read_commit and read_snapshot
13 recompute the hash from stored fields and return None on mismatch.
14
15 === Coverage ===
16
17 Unit — write_commit skips clean existing record (no regression)
18 Unit — write_commit skips on corrupt object (idempotent)
19 Unit — read_commit returns None for corrupt snapshot_id field
20 Unit — read_commit returns None for corrupt message field
21 Unit — read_commit returns None for corrupt parent_commit_id field
22 Unit — write_commit skips on content-level hash mismatch (no OSError)
23 Unit — write_snapshot skips clean existing record (no regression)
24 Unit — read_snapshot returns None for corrupt manifest
25 Data — parent chain (A→B→C): corrupt B → B unreadable, A/C readable
26 Data — corrupting one snapshot does not affect sibling snapshots
27 Security — corrupt snapshot_id in commit is rejected at read time
28 Security — injected manifest entry is rejected at read time
29 Stress — 20 concurrent write_commit calls are all idempotent
30 Stress — 20 concurrent write_snapshot calls are all idempotent
31 Stress — 50 sequential commits all written and readable
32 Regression — write_commit new file works
33 Regression — write_snapshot new file works
34 Regression — write_commit idempotent on clean file
35 Regression — write_snapshot idempotent on clean file
36 """
37 from __future__ import annotations
38
39 import datetime
40 import json as _json
41 import pathlib
42 import threading
43
44 import pytest
45
46 from muse.core.types import Manifest, fake_id
47 from muse.core.object_store import object_path as _obj_path
48 from muse.core.paths import muse_dir
49
50 # ---------------------------------------------------------------------------
51 # Helpers
52 # ---------------------------------------------------------------------------
53
# Fallback blob ID used by _good_snapshot when no manifest is supplied.
_DEFAULT_BLOB = fake_id("default-blob")
# Fallback snapshot ID used by _good_commit when no snapshot_id is supplied.
_DEFAULT_SNAP = fake_id("default-snap")
# Field-name -> value overrides that _write_corrupt_commit merges over an
# otherwise valid commit payload.
_CorruptField = dict[str, str | int | None]
57
58
def _make_repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Create the ``objects/sha256`` directory under ``muse_dir(tmp_path)``.

    Returns *tmp_path* unchanged so the call can be used inline as the
    repository root in a test.
    """
    sha256_dir = muse_dir(tmp_path) / "objects" / "sha256"
    sha256_dir.mkdir(parents=True, exist_ok=True)
    return tmp_path
62
63
64 def _ts(year: int = 2024) -> str:
65 return f"{year}-01-01T00:00:00+00:00"
66
67
def _good_commit(
    snapshot_id: str | None = None,
    message: str = "test commit",
    parent_commit_id: str | None = None,
    ts: str | None = None,
) -> "CommitRecord":
    """Build a CommitRecord whose commit_id is derived from its own fields.

    The ID is computed with ``hash_commit`` over the parent list, snapshot ID,
    message, timestamp and author, so the returned record is self-consistent.

    Args:
        snapshot_id: Snapshot to reference; a falsy value selects ``_DEFAULT_SNAP``.
        message: Commit message.
        parent_commit_id: Optional single parent; a falsy value means no parent.
        ts: ISO-8601 timestamp string; a falsy value selects ``_ts()``.

    Returns:
        A CommitRecord on branch ``"main"`` authored by ``"gabriel"`` with
        empty metadata and no second parent.
    """
    from muse.core.commits import CommitRecord
    from muse.core.ids import hash_commit

    effective_snapshot = snapshot_id if snapshot_id else _DEFAULT_SNAP
    committed_at_iso = ts if ts else _ts()

    parents = []
    if parent_commit_id:
        parents.append(parent_commit_id)

    computed_id = hash_commit(
        parent_ids=parents,
        snapshot_id=effective_snapshot,
        message=message,
        committed_at_iso=committed_at_iso,
        author="gabriel",
    )
    committed_at = datetime.datetime.fromisoformat(committed_at_iso)
    return CommitRecord(
        commit_id=computed_id,
        branch="main",
        snapshot_id=effective_snapshot,
        message=message,
        committed_at=committed_at,
        parent_commit_id=parent_commit_id,
        parent2_commit_id=None,
        author="gabriel",
        metadata={},
    )
98
99
def _good_snapshot(manifest: Manifest | None = None) -> "SnapshotRecord":
    """Build a SnapshotRecord whose snapshot_id is ``hash_snapshot`` of its manifest.

    Args:
        manifest: Path -> object-ID mapping. ``None`` selects a one-file
            default manifest (``src/main.py`` -> ``_DEFAULT_BLOB``). An
            explicitly passed empty mapping is honoured as an empty manifest.

    Returns:
        A SnapshotRecord with the given manifest and no directories.
    """
    from muse.core.snapshots import SnapshotRecord
    from muse.core.ids import hash_snapshot

    # Compare against None rather than relying on truthiness: ``{}`` is falsy,
    # and ``manifest or default`` would silently replace a deliberately empty
    # manifest with the default one.
    if manifest is None:
        manifest = {"src/main.py": _DEFAULT_BLOB}
    snapshot_id = hash_snapshot(manifest)
    return SnapshotRecord(snapshot_id=snapshot_id, manifest=manifest, directories={})
107
108
def _write_corrupt_commit(repo: pathlib.Path, good: "CommitRecord", corrupt_field: _CorruptField) -> None:
    """Store a tampered commit object at the object path of *good*.

    The payload starts from the fields of *good* and is then overridden with
    *corrupt_field*, so the file keeps a well-formed ``commit <len>\\0`` header
    while its content no longer matches ``good.commit_id``.

    Args:
        repo: Repository root.
        good: Valid commit whose ID decides where the object is written.
        corrupt_field: Field overrides merged over the valid payload.
    """
    fields = {
        "commit_id": good.commit_id,
        "repo_id": "test-repo",
        "branch": "main",
        "snapshot_id": good.snapshot_id,
        "message": good.message,
        "committed_at": good.committed_at.isoformat(),
        "parent_commit_id": good.parent_commit_id,
        "parent2_commit_id": None,
        "author": "gabriel",
        "metadata": {},
        "reviewed_by": [],
    }
    # Overrides keep the position of existing keys and append any new ones.
    for key, value in corrupt_field.items():
        fields[key] = value

    payload = _json.dumps(fields, separators=(",", ":")).encode()
    target = _obj_path(repo, good.commit_id)
    target.parent.mkdir(parents=True, exist_ok=True)
    header = f"commit {len(payload)}\0".encode()
    target.write_bytes(header + payload)
129
130
def _write_corrupt_snapshot(repo: pathlib.Path, good: "SnapshotRecord", corrupt_manifest: Manifest) -> None:
    """Store a tampered snapshot object at the object path of *good*.

    The written record keeps ``good.snapshot_id`` and a well-formed
    ``snapshot <len>\\0`` header but carries *corrupt_manifest* instead of the
    manifest that the ID was computed from.

    Args:
        repo: Repository root.
        good: Valid snapshot whose ID decides where the object is written.
        corrupt_manifest: Manifest to store in place of the real one.
    """
    payload = _json.dumps(
        {
            "snapshot_id": good.snapshot_id,
            "manifest": corrupt_manifest,
            "directories": {},
        },
        separators=(",", ":"),
    ).encode()
    target = _obj_path(repo, good.snapshot_id)
    target.parent.mkdir(parents=True, exist_ok=True)
    header = f"snapshot {len(payload)}\0".encode()
    target.write_bytes(header + payload)
142
143
144 # =============================================================================
145 # 1. UNIT — write_commit idempotency and corruption detection
146 # =============================================================================
147
148
class TestWriteCommitHashVerification:
    """write_commit never overwrites an existing object; read_commit rejects corrupt ones.

    Each corruption test plants a tampered object at the commit's object path,
    calls write_commit (expected to skip because the path exists) and then
    expects read_commit to return None.
    """

    def test_idempotent_skip_clean_record(self, tmp_path: pathlib.Path) -> None:
        """Regression: write_commit on a clean existing file still returns fast."""
        from muse.core.commits import (
            read_commit,
            write_commit,
        )

        repo = _make_repo(tmp_path)
        good = _good_commit()
        write_commit(repo, good)
        write_commit(repo, good)  # second call: must not raise, must not change data
        result = read_commit(repo, good.commit_id)
        assert result is not None
        assert result.commit_id == good.commit_id

    def test_corrupt_snapshot_id_detected_at_read(self, tmp_path: pathlib.Path) -> None:
        """
        A commit object with a corrupt snapshot_id is detected at read time.

        write_commit is idempotent: it skips if object_path exists, so a
        pre-existing corrupt file is NOT overwritten. read_commit recomputes
        the hash and returns None when snapshot_id doesn't match commit_id.
        """
        from muse.core.commits import (
            read_commit,
            write_commit,
        )

        repo = _make_repo(tmp_path)
        good = _good_commit()
        _write_corrupt_commit(repo, good, {"snapshot_id": fake_id("attacker-snapshot")})

        write_commit(repo, good)  # skips — object already exists
        result = read_commit(repo, good.commit_id)
        assert result is None, (
            "read_commit must detect corrupt snapshot_id via hash verification "
            "and return None — not silently serve corrupt content."
        )

    def test_corrupt_message_detected_at_read(self, tmp_path: pathlib.Path) -> None:
        """A commit with a corrupt message is detected at read time."""
        from muse.core.commits import (
            read_commit,
            write_commit,
        )

        repo = _make_repo(tmp_path)
        good = _good_commit(message="original message")
        _write_corrupt_commit(repo, good, {"message": "CORRUPTED MESSAGE"})

        write_commit(repo, good)  # skips — object already exists
        result = read_commit(repo, good.commit_id)
        assert result is None, "read_commit must detect corrupt message via hash verification"

    def test_corrupt_parent_commit_id_detected_at_read(self, tmp_path: pathlib.Path) -> None:
        """A commit with a corrupt parent_commit_id is detected at read time."""
        from muse.core.commits import (
            read_commit,
            write_commit,
        )

        repo = _make_repo(tmp_path)
        good = _good_commit(parent_commit_id=None)
        _write_corrupt_commit(repo, good, {"parent_commit_id": fake_id("injected-parent")})

        write_commit(repo, good)  # skips — object already exists
        result = read_commit(repo, good.commit_id)
        assert result is None, "read_commit must detect corrupt parent_commit_id via hash verification"

    def test_content_hash_mismatch_skipped_not_raised(self, tmp_path: pathlib.Path) -> None:
        """
        write_commit is always idempotent — never raises OSError for content-level
        mismatches. A corrupt object at object_path is silently skipped.
        Hash mismatches are detected later by read_commit.
        """
        from muse.core.commits import (
            read_commit,
            write_commit,
        )

        repo = _make_repo(tmp_path)
        good_a = _good_commit(message="commit A", ts=_ts(2024))
        good_b = _good_commit(message="commit B", ts=_ts(2025))

        # Write B's data under A's object_path (commit_id field mismatch)
        _write_corrupt_commit(repo, good_a, {
            "commit_id": good_b.commit_id,
            "message": good_b.message,
            "snapshot_id": good_b.snapshot_id,
        })

        # write_commit must NOT raise — it skips (idempotent)
        write_commit(repo, good_a)

        # read_commit detects the hash mismatch and returns None
        result = read_commit(repo, good_a.commit_id)
        assert result is None, "read_commit must detect commit_id field mismatch"
248
249
250 # =============================================================================
251 # 2. UNIT — write_snapshot idempotency and corruption detection
252 # =============================================================================
253
254
class TestWriteSnapshotHashVerification:
    """write_snapshot skips existing objects; read_snapshot rejects tampered manifests."""

    def test_idempotent_skip_clean_snapshot(self, tmp_path: pathlib.Path) -> None:
        """Writing the same clean snapshot twice leaves it readable with the same ID."""
        from muse.core.snapshots import read_snapshot, write_snapshot

        root = _make_repo(tmp_path)
        snapshot = _good_snapshot()
        for _ in range(2):  # the second write is the idempotent one
            write_snapshot(root, snapshot)

        loaded = read_snapshot(root, snapshot.snapshot_id)
        assert loaded is not None
        assert loaded.snapshot_id == snapshot.snapshot_id

    def test_corrupt_object_id_in_manifest_detected_at_read(self, tmp_path: pathlib.Path) -> None:
        """
        A manifest entry pointing at the wrong object ID is rejected on read.

        The tampered object is planted first; write_snapshot then finds the
        path occupied and skips, and read_snapshot is expected to return None.
        """
        from muse.core.snapshots import read_snapshot, write_snapshot

        root = _make_repo(tmp_path)
        original_blob = fake_id("main-blob")
        snapshot = _good_snapshot({"src/main.py": original_blob})
        tampered = {"src/main.py": fake_id("wrong-blob")}
        _write_corrupt_snapshot(root, snapshot, tampered)

        write_snapshot(root, snapshot)  # skips — object already exists
        loaded = read_snapshot(root, snapshot.snapshot_id)
        assert loaded is None, "read_snapshot must detect corrupt manifest object ID"

    def test_extra_manifest_entry_detected_at_read(self, tmp_path: pathlib.Path) -> None:
        """A manifest with one additional, injected file is rejected on read."""
        from muse.core.snapshots import read_snapshot, write_snapshot

        root = _make_repo(tmp_path)
        original_blob = fake_id("main-blob")
        snapshot = _good_snapshot({"src/main.py": original_blob})
        tampered = {
            "src/main.py": original_blob,
            "INJECTED_FILE.py": fake_id("injected-blob"),
        }
        _write_corrupt_snapshot(root, snapshot, tampered)

        write_snapshot(root, snapshot)  # skips
        loaded = read_snapshot(root, snapshot.snapshot_id)
        assert loaded is None, "read_snapshot must detect injected manifest entry"

    def test_empty_manifest_detected_at_read(self, tmp_path: pathlib.Path) -> None:
        """A stored manifest that has been wiped to empty is rejected on read."""
        from muse.core.snapshots import read_snapshot, write_snapshot

        root = _make_repo(tmp_path)
        full_manifest = {
            "src/main.py": fake_id("main-blob"),
            "src/utils.py": fake_id("utils-blob"),
        }
        snapshot = _good_snapshot(full_manifest)
        _write_corrupt_snapshot(root, snapshot, {})  # manifest wiped

        write_snapshot(root, snapshot)  # skips
        loaded = read_snapshot(root, snapshot.snapshot_id)
        assert loaded is None, "read_snapshot must detect empty manifest (hash mismatch)"

    def test_missing_manifest_entry_detected_at_read(self, tmp_path: pathlib.Path) -> None:
        """A stored manifest that lost one of its files is rejected on read."""
        from muse.core.snapshots import read_snapshot, write_snapshot

        root = _make_repo(tmp_path)
        kept_blob = fake_id("main-blob")
        snapshot = _good_snapshot({
            "src/main.py": kept_blob,
            "src/utils.py": fake_id("utils-blob"),
        })
        truncated = {"src/main.py": kept_blob}  # utils.py missing
        _write_corrupt_snapshot(root, snapshot, truncated)

        write_snapshot(root, snapshot)  # skips
        loaded = read_snapshot(root, snapshot.snapshot_id)
        assert loaded is None, "read_snapshot must detect missing manifest entry"
348
349
350 # =============================================================================
351 # 3. DATA INTEGRITY — full commit → snapshot chain
352 # =============================================================================
353
354
class TestCommitSnapshotChain:
    """Corruption of one stored object must not spill over to related objects."""

    def test_clean_commit_and_snapshot_both_readable(self, tmp_path: pathlib.Path) -> None:
        """A commit and the snapshot it references can both be read back."""
        from muse.core.commits import read_commit, write_commit
        from muse.core.snapshots import read_snapshot, write_snapshot

        root = _make_repo(tmp_path)
        snapshot = _good_snapshot({"src/main.py": fake_id("main-blob")})
        commit = _good_commit(snapshot_id=snapshot.snapshot_id)

        write_snapshot(root, snapshot)
        write_commit(root, commit)

        loaded_commit = read_commit(root, commit.commit_id)
        assert loaded_commit is not None
        # Follow the link stored in the commit rather than the local variable.
        loaded_snapshot = read_snapshot(root, loaded_commit.snapshot_id)
        assert loaded_snapshot is not None

    def test_parent_chain_corrupt_middle_unreadable(self, tmp_path: pathlib.Path) -> None:
        """In a chain A→B→C, corrupting B makes only B unreadable."""
        from muse.core.commits import read_commit, write_commit

        root = _make_repo(tmp_path)
        first = _good_commit(message="commit A", ts=_ts(2022))
        middle = _good_commit(message="commit B", parent_commit_id=first.commit_id, ts=_ts(2023))
        last = _good_commit(message="commit C", parent_commit_id=middle.commit_id, ts=_ts(2024))

        for commit in (first, middle, last):
            write_commit(root, commit)

        # Overwrite B's stored object with a payload that no longer hashes to B.
        _write_corrupt_commit(root, middle, {"snapshot_id": fake_id("wrong-snap")})

        assert read_commit(root, first.commit_id) is not None, "A must be readable"
        assert read_commit(root, middle.commit_id) is None, "B must be unreadable after corruption"
        assert read_commit(root, last.commit_id) is not None, "C must be readable"

    def test_corrupting_one_snapshot_does_not_affect_siblings(self, tmp_path: pathlib.Path) -> None:
        """Of three unrelated snapshots, only the tampered one becomes unreadable."""
        from muse.core.snapshots import read_snapshot, write_snapshot

        root = _make_repo(tmp_path)
        first = _good_snapshot({"a.py": fake_id("a-blob")})
        middle = _good_snapshot({"b.py": fake_id("b-blob")})
        last = _good_snapshot({"c.py": fake_id("c-blob")})

        for snapshot in (first, middle, last):
            write_snapshot(root, snapshot)

        _write_corrupt_snapshot(root, middle, {"b.py": fake_id("wrong-blob")})

        assert read_snapshot(root, first.snapshot_id) is not None, "snap_a must be readable"
        assert read_snapshot(root, middle.snapshot_id) is None, "snap_b must be unreadable after corruption"
        assert read_snapshot(root, last.snapshot_id) is not None, "snap_c must be readable"
424
425
426 # =============================================================================
427 # 4. SECURITY — corrupt fields cannot forge content
428 # =============================================================================
429
430
class TestSecurityCorruptFields:
    """Tampered fields in stored objects must be rejected when the object is read."""

    def test_corrupt_snapshot_id_in_commit_rejected_at_read(self, tmp_path: pathlib.Path) -> None:
        """
        An attacker who corrupts a commit's snapshot_id cannot make Muse read
        different content — the hash mismatch is detected by read_commit.
        """
        from muse.core.commits import (
            read_commit,
            write_commit,
        )

        repo = _make_repo(tmp_path)
        good = _good_commit()
        attacker_snapshot = fake_id("attacker-snapshot")

        _write_corrupt_commit(repo, good, {"snapshot_id": attacker_snapshot})

        write_commit(repo, good)  # skips — object exists
        result = read_commit(repo, good.commit_id)
        assert result is None, (
            "Corrupt commit with attacker's snapshot_id must be rejected at read "
            "time — hash verification must detect the field substitution."
        )

    def test_injected_manifest_entry_rejected_at_read(self, tmp_path: pathlib.Path) -> None:
        """
        An injected file in the manifest (hash mismatch) is rejected at read time.
        """
        from muse.core.snapshots import (
            read_snapshot,
            write_snapshot,
        )

        repo = _make_repo(tmp_path)
        blob = fake_id("main-blob")
        good = _good_snapshot({"src/main.py": blob})
        _write_corrupt_snapshot(repo, good, {
            "src/main.py": blob,
            "malicious_backdoor.py": fake_id("backdoor-blob"),
        })

        write_snapshot(repo, good)  # skips
        result = read_snapshot(repo, good.snapshot_id)
        assert result is None, (
            "Snapshot with injected manifest entry must be rejected at read time."
        )
478
479
480 # =============================================================================
481 # 5. STRESS — concurrent writes are idempotent
482 # =============================================================================
483
484
class TestStressConcurrentWrite:
    """Stress tests: many writers of the same object, and many distinct commits."""

    def test_concurrent_write_commit_all_idempotent(self, tmp_path: pathlib.Path) -> None:
        """20 concurrent write_commit calls on a good commit are all idempotent."""
        from muse.core.commits import (
            read_commit,
            write_commit,
        )

        repo = _make_repo(tmp_path)
        good = _good_commit()
        # An exception raised inside a Thread does not propagate to the test,
        # so each worker records its failure here for an explicit assertion.
        errors: list[Exception] = []

        def worker() -> None:
            try:
                write_commit(repo, good)
            except Exception as exc:  # reported through the assertion below
                errors.append(exc)

        threads = [threading.Thread(target=worker) for _ in range(20)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        assert not errors, f"write_commit raised in worker threads: {errors!r}"
        result = read_commit(repo, good.commit_id)
        assert result is not None, "After 20 concurrent write_commit calls, commit must be readable"
        assert result.snapshot_id == good.snapshot_id

    def test_concurrent_write_snapshot_all_idempotent(self, tmp_path: pathlib.Path) -> None:
        """20 concurrent write_snapshot calls on a good snapshot are all idempotent."""
        from muse.core.snapshots import (
            read_snapshot,
            write_snapshot,
        )

        repo = _make_repo(tmp_path)
        good = _good_snapshot()
        # Same reasoning as for commits: collect worker failures so that they
        # fail the test instead of being lost with the thread.
        errors: list[Exception] = []

        def worker() -> None:
            try:
                write_snapshot(repo, good)
            except Exception as exc:  # reported through the assertion below
                errors.append(exc)

        threads = [threading.Thread(target=worker) for _ in range(20)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        assert not errors, f"write_snapshot raised in worker threads: {errors!r}"
        result = read_snapshot(repo, good.snapshot_id)
        assert result is not None, "After 20 concurrent write_snapshot calls, snapshot must be readable"

    def test_50_sequential_commits_all_readable(self, tmp_path: pathlib.Path) -> None:
        """50 different commits all written and readable."""
        from muse.core.commits import (
            read_commit,
            write_commit,
        )

        for i in range(50):
            # Each commit gets its own repository directory under tmp_path.
            repo = _make_repo(tmp_path / str(i))
            good = _good_commit(message=f"commit {i}", ts=f"202{i % 10}-01-01T00:00:00+00:00")
            write_commit(repo, good)
            result = read_commit(repo, good.commit_id)
            assert result is not None, f"commit {i} not readable"
542
543
544 # =============================================================================
545 # 6. REGRESSION — normal write paths still work
546 # =============================================================================
547
548
class TestRegression:
    """Plain write-then-read paths for commits and snapshots."""

    def test_write_commit_new_file_works(self, tmp_path: pathlib.Path) -> None:
        """A commit written once to a fresh repository can be read back."""
        from muse.core.commits import read_commit, write_commit

        root = _make_repo(tmp_path)
        commit = _good_commit()
        write_commit(root, commit)
        loaded = read_commit(root, commit.commit_id)
        assert loaded is not None

    def test_write_snapshot_new_file_works(self, tmp_path: pathlib.Path) -> None:
        """A snapshot written once to a fresh repository can be read back."""
        from muse.core.snapshots import read_snapshot, write_snapshot

        root = _make_repo(tmp_path)
        snapshot = _good_snapshot()
        write_snapshot(root, snapshot)
        loaded = read_snapshot(root, snapshot.snapshot_id)
        assert loaded is not None

    def test_write_commit_idempotent_on_clean_file(self, tmp_path: pathlib.Path) -> None:
        """Ten further writes of an already stored commit leave it readable."""
        from muse.core.commits import read_commit, write_commit

        root = _make_repo(tmp_path)
        commit = _good_commit()
        write_commit(root, commit)
        repeats = 10
        while repeats:
            write_commit(root, commit)
            repeats -= 1
        loaded = read_commit(root, commit.commit_id)
        assert loaded is not None

    def test_write_snapshot_idempotent_on_clean_file(self, tmp_path: pathlib.Path) -> None:
        """Ten further writes of an already stored snapshot leave it readable."""
        from muse.core.snapshots import read_snapshot, write_snapshot

        root = _make_repo(tmp_path)
        snapshot = _good_snapshot()
        write_snapshot(root, snapshot)
        repeats = 10
        while repeats:
            write_snapshot(root, snapshot)
            repeats -= 1
        loaded = read_snapshot(root, snapshot.snapshot_id)
        assert loaded is not None