test_integrity_I6_snapshot_scale.py
python
sha256:06dba78c2a78e251b580422dd1fd547f3c8357ff18f7709a860873b2d24dbbbf
chore: bump version to 0.2.0rc14
Sonnet 4.6
patch
15 hours ago
| 1 | """Phase 1.6 — Linux-kernel scale snapshot manifest. |
| 2 | |
| 3 | Tests cover: |
| 4 | - 75 000-file cold walk correctness and timing (< 30 s on tmpfs) |
| 5 | - Warm walk (cache hit, no changes) < 500 ms |
| 6 | - Partial change: only modified files re-hashed |
| 7 | - Memory ceiling: RSS stays flat during large-file streaming |
| 8 | - Deep directory nesting (100 levels) — no recursion errors |
| 9 | - Inode-based cache invalidation: atomic file replacement detected |
| 10 | - Concurrent walks: two processes walking simultaneously |
| 11 | - Cache size limit (MAX_CACHE_BYTES) enforced on load |
| 12 | - Cache format: JSON v3 on disk |
| 13 | - Cache fsync + mkstemp atomicity: no fixed .tmp collision |
| 14 | - Deleted-file prune: stale cache entries removed after walk |
| 15 | - Empty repo: no crash, empty manifest |
| 16 | - Path safety: symlinks excluded, non-regular files excluded |
| 17 | """ |
| 18 | |
| 19 | from __future__ import annotations |
| 20 | |
| 21 | import json |
| 22 | import os |
| 23 | import pathlib |
| 24 | import resource |
| 25 | import stat |
| 26 | import tempfile |
| 27 | import threading |
| 28 | import time |
| 29 | |
| 30 | import json as _json |
| 31 | import pytest |
| 32 | |
| 33 | from muse.core.snapshot import build_snapshot_manifest, walk_workdir |
| 34 | from muse.core.types import blob_id |
| 35 | from muse.core.paths import muse_dir, stat_cache_path as _stat_cache_path |
| 36 | from muse.core.stat_cache import ( |
| 37 | MAX_CACHE_BYTES, |
| 38 | FileCacheEntry, |
| 39 | StatCache, |
| 40 | _CACHE_VERSION, |
| 41 | load_cache, |
| 42 | ) |
| 43 | |
| 44 | # --------------------------------------------------------------------------- |
| 45 | # Helpers |
| 46 | # --------------------------------------------------------------------------- |
| 47 | |
def _make_muse_dir(root: pathlib.Path) -> pathlib.Path:
    """Create the muse directory for *root* plus its ``cache`` subdirectory.

    Both directories are created with ``exist_ok=True``.  Returns the path
    produced by ``muse_dir(root)``.
    """
    dot_muse = muse_dir(root)
    # Parent first, then the nested cache directory.
    for directory in (dot_muse, dot_muse / "cache"):
        directory.mkdir(exist_ok=True)
    return dot_muse
| 53 | |
| 54 | |
def _write(path: pathlib.Path, content: str = "x") -> pathlib.Path:
    """Write *content* to *path* as UTF-8 text and return *path*.

    Missing parent directories are created first.
    """
    parent = path.parent
    parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
    return path
| 59 | |
| 60 | |
| 61 | |
def _create_files_flat(root: pathlib.Path, n: int, size_bytes: int = 256) -> None:
    """Populate *root* with *n* files named ``file_NNNNNN.txt``.

    Every file holds *size_bytes* copies of the byte ``x``.
    """
    payload = b"x" * size_bytes
    names = (f"file_{index:06d}.txt" for index in range(n))
    for name in names:
        (root / name).write_bytes(payload)
| 68 | |
| 69 | |
def _create_files_deep(root: pathlib.Path, depth: int, files_per_level: int = 2) -> None:
    """Build a single chain of nested directories *depth* levels below *root*.

    Level ``k`` lives in a directory named ``dKKK`` inside level ``k - 1`` and
    receives *files_per_level* files ``f0.txt``, ``f1.txt``, ... whose content
    is ``b"level"`` followed by the decimal level number.
    """
    directory = root
    for level in range(depth):
        directory = directory / f"d{level:03d}"
        directory.mkdir(exist_ok=True)
        payload = b"level" + str(level).encode()
        for index in range(files_per_level):
            directory.joinpath(f"f{index}.txt").write_bytes(payload)
| 78 | |
| 79 | |
| 80 | # --------------------------------------------------------------------------- |
| 81 | # 1. Format — JSON v3 on disk |
| 82 | # --------------------------------------------------------------------------- |
| 83 | |
class TestCacheFormat:
    """On-disk format of the stat cache and fallback to an empty cache."""

    def test_cache_file_is_json(self, tmp_path: pathlib.Path) -> None:
        """A saved cache is a ``.json`` file carrying the current version."""
        dot_muse = _make_muse_dir(tmp_path)
        source = _write(tmp_path / "a.py", "hello")
        writer = StatCache.load(dot_muse)
        writer.get_object_hash(tmp_path, source)
        writer.save()

        on_disk = _stat_cache_path(dot_muse.parent)
        assert on_disk.is_file()
        assert on_disk.suffix == ".json"
        document = _json.loads(on_disk.read_bytes())
        assert document["version"] == _CACHE_VERSION

    def test_cache_entry_has_ino_field(self, tmp_path: pathlib.Path) -> None:
        """Each persisted entry records a positive integer ``ino``."""
        dot_muse = _make_muse_dir(tmp_path)
        source = _write(tmp_path / "b.py", "world")
        writer = StatCache.load(dot_muse)
        writer.get_object_hash(tmp_path, source)
        writer.save()

        on_disk = _stat_cache_path(dot_muse.parent)
        document = _json.loads(on_disk.read_bytes())
        inode = document["entries"]["b.py"].get("ino")
        assert "ino" in document["entries"]["b.py"]
        assert isinstance(inode, int)
        assert inode > 0

    def test_version_mismatch_returns_empty(self, tmp_path: pathlib.Path) -> None:
        """A cache file declaring version 99 is expected to load as empty."""
        dot_muse = _make_muse_dir(tmp_path)
        # Write a v99 cache — should be discarded
        stale_document = {"version": 99, "entries": {}}
        on_disk = _stat_cache_path(dot_muse.parent)
        on_disk.write_bytes(_json.dumps(stale_document).encode())
        loaded = StatCache.load(dot_muse)
        assert len(loaded._entries) == 0

    def test_corrupt_cache_returns_empty(self, tmp_path: pathlib.Path) -> None:
        """Non-JSON bytes in the cache file are expected to load as empty."""
        dot_muse = _make_muse_dir(tmp_path)
        on_disk = _stat_cache_path(dot_muse.parent)
        on_disk.write_bytes(b"\xff\x00garbage\xde\xad")
        loaded = StatCache.load(dot_muse)
        assert len(loaded._entries) == 0

    def test_absent_cache_returns_empty(self, tmp_path: pathlib.Path) -> None:
        """With no cache file written, ``load`` is expected to yield no entries."""
        dot_muse = _make_muse_dir(tmp_path)
        loaded = StatCache.load(dot_muse)
        assert len(loaded._entries) == 0
| 129 | |
| 130 | |
| 131 | # --------------------------------------------------------------------------- |
| 132 | # 2. Inode-based invalidation |
| 133 | # --------------------------------------------------------------------------- |
| 134 | |
class TestInodeInvalidation:
    """Cache lookups must take the inode into account, not just mtime and size."""

    def test_atomic_replace_invalidates_cache(self, tmp_path: pathlib.Path) -> None:
        """Atomically replacing a file (same mtime/size) is detected via new inode."""
        dot_muse = _make_muse_dir(tmp_path)
        f = tmp_path / "data.bin"
        content_a = b"A" * 64
        content_b = b"B" * 64  # same size as content_a
        f.write_bytes(content_a)

        cache = StatCache.load(dot_muse)
        hash_a = cache.get_object_hash(tmp_path, f)
        cache.save()

        # Atomically replace with different content (same size), then put the
        # original timestamps back so only the inode distinguishes the files.
        orig_stat = f.stat()
        with tempfile.NamedTemporaryFile(dir=tmp_path, delete=False) as tmp:
            tmp.write(content_b)
            tmp_name = tmp.name
        os.replace(tmp_name, f)
        # Use the integer-nanosecond fields: the float ``st_mtime`` cannot
        # represent every nanosecond timestamp, so restoring from it may leave
        # the mtime slightly off and make the test pass for the wrong reason.
        os.utime(f, ns=(orig_stat.st_atime_ns, orig_stat.st_mtime_ns))

        # Verify the scenario really is "same mtime + size, different inode".
        new_stat = f.stat()
        assert new_stat.st_ino != orig_stat.st_ino
        assert new_stat.st_mtime_ns == orig_stat.st_mtime_ns
        assert new_stat.st_size == orig_stat.st_size

        # The new file has a different inode — cache must miss
        cache2 = StatCache.load(dot_muse)
        hash_b = cache2.get_object_hash(tmp_path, f)

        assert hash_a != hash_b, (
            "Cache returned stale hash after atomic replacement — "
            "inode invalidation is not working"
        )

    def test_mtime_size_same_but_ino_changed_is_miss(
        self, tmp_path: pathlib.Path
    ) -> None:
        """If ino changes, cache must invalidate even with same mtime+size."""
        dot_muse = _make_muse_dir(tmp_path)
        f = tmp_path / "tricky.py"
        f.write_bytes(b"old_content_pad")  # 15 bytes

        cache = StatCache.load(dot_muse)
        st = f.stat()
        old_hash = cache.get_cached("tricky.py", str(f), st.st_mtime, st.st_size, st.st_ino)
        cache.save()

        # Simulate a different inode by using a value the real file cannot have.
        fake_ino = st.st_ino + 999_999
        cache2 = StatCache.load(dot_muse)
        entry = cache2._entries.get("tricky.py")
        assert entry is not None
        assert entry["ino"] != fake_ino
        # Start from a clean flag so the dirty assertion below can only be
        # satisfied by the lookup itself (mirrors test_unchanged_file_is_cache_hit).
        cache2._dirty = False
        # Lookup with the wrong inode must be treated as a miss and re-hash.
        new_hash = cache2.get_cached("tricky.py", str(f), st.st_mtime, st.st_size, fake_ino)
        # The file on disk is untouched, so the recomputed hash is the same...
        assert new_hash == old_hash  # same file content
        # ...but the cache must have recorded the miss.
        assert cache2._dirty  # but it was a miss (dirty)

    def test_unchanged_file_is_cache_hit(self, tmp_path: pathlib.Path) -> None:
        """An untouched file is served from the reloaded cache without a miss."""
        dot_muse = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "stable.py", "unchanged")
        cache = StatCache.load(dot_muse)
        h1 = cache.get_object_hash(tmp_path, f)
        cache.save()

        cache2 = StatCache.load(dot_muse)
        cache2._dirty = False
        h2 = cache2.get_object_hash(tmp_path, f)

        assert h1 == h2
        assert not cache2._dirty  # genuine cache hit
| 207 | |
| 208 | |
| 209 | # --------------------------------------------------------------------------- |
| 210 | # 3. Concurrent write safety (mkstemp — no .tmp collision) |
| 211 | # --------------------------------------------------------------------------- |
| 212 | |
class TestConcurrentSaveSafety:
    """``StatCache.save`` must be safe when several threads save at once."""

    def test_concurrent_saves_no_collision(self, tmp_path: pathlib.Path) -> None:
        """Two threads saving simultaneously must not corrupt each other."""
        dot_muse = _make_muse_dir(tmp_path)
        errors: list[Exception] = []

        def save_cache(i: int) -> None:
            # Guard the whole worker body: an exception escaping a thread is
            # only printed, never re-raised in the test, so anything not
            # collected here would let the test pass silently.
            try:
                f = _write(tmp_path / f"thread_{i}.py", f"content {i}")
                cache = StatCache.load(dot_muse)
                cache.get_object_hash(tmp_path, f)
                cache.save()
            except Exception as exc:
                errors.append(exc)

        threads = [threading.Thread(target=save_cache, args=(i,)) for i in range(20)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        assert not errors, f"Concurrent saves failed: {errors[:3]}"
        cache_path = _stat_cache_path(dot_muse.parent)
        # No stray .tmp files left behind (mkstemp writes into cache/)
        tmp_files = list(cache_path.parent.glob(".stat_cache_*.tmp"))
        assert not tmp_files, f"Stray temp files: {tmp_files}"
        # Whichever save landed last, the surviving file must be a complete,
        # parseable cache document — not an interleaving of several writers.
        raw = _json.loads(cache_path.read_bytes())
        assert raw["version"] == _CACHE_VERSION

    def test_no_fixed_tmp_suffix(self, tmp_path: pathlib.Path) -> None:
        """save() must NOT create a file named stat.json.tmp."""
        dot_muse = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "x.py", "hi")
        cache = StatCache.load(dot_muse)
        cache.get_object_hash(tmp_path, f)
        cache.save()

        fixed_tmp = _stat_cache_path(dot_muse.parent).with_suffix(".json.tmp")
        assert not fixed_tmp.exists(), "Fixed .tmp name — concurrent save race possible"
| 249 | |
| 250 | |
| 251 | # --------------------------------------------------------------------------- |
| 252 | # 4. Cache size limit (MAX_CACHE_BYTES) |
| 253 | # --------------------------------------------------------------------------- |
| 254 | |
class TestCacheSizeLimit:
    """Behaviour of ``StatCache.load`` around the ``MAX_CACHE_BYTES`` limit."""

    def test_max_cache_bytes_is_exported(self) -> None:
        """``MAX_CACHE_BYTES`` is an ``int`` of at least 64 MiB."""
        floor = 64 * 1024 * 1024
        assert isinstance(MAX_CACHE_BYTES, int)
        assert MAX_CACHE_BYTES >= floor

    def test_oversized_cache_returns_empty(
        self, tmp_path: pathlib.Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """A cache file one byte over the limit loads empty and logs CRITICAL."""
        import logging
        import unittest.mock as _mock

        dot_muse = _make_muse_dir(tmp_path)
        # Save a valid cache first; the limit is then patched below its size.
        source = _write(tmp_path / "z.py", "z" * 1000)
        writer = StatCache.load(dot_muse)
        writer.get_object_hash(tmp_path, source)
        writer.save()

        too_small = _stat_cache_path(dot_muse.parent).stat().st_size - 1
        limit_patch = _mock.patch("muse.core.stat_cache.MAX_CACHE_BYTES", too_small)
        capture = caplog.at_level(logging.CRITICAL, logger="muse.core.stat_cache")
        with limit_patch, capture:
            loaded = StatCache.load(dot_muse)
        assert len(loaded._entries) == 0
        critical_records = [r for r in caplog.records if r.levelno >= logging.CRITICAL]
        assert critical_records

    def test_exactly_at_limit_is_accepted(self, tmp_path: pathlib.Path) -> None:
        """A cache file whose size equals the limit is still loaded."""
        import unittest.mock as _mock

        dot_muse = _make_muse_dir(tmp_path)
        source = _write(tmp_path / "y.py", "y")
        writer = StatCache.load(dot_muse)
        writer.get_object_hash(tmp_path, source)
        writer.save()

        exact_size = _stat_cache_path(dot_muse.parent).stat().st_size
        with _mock.patch("muse.core.stat_cache.MAX_CACHE_BYTES", exact_size):
            loaded = StatCache.load(dot_muse)
        assert len(loaded._entries) == 1
| 291 | |
| 292 | |
| 293 | # --------------------------------------------------------------------------- |
| 294 | # 5. Prune: deleted files evicted from cache |
| 295 | # --------------------------------------------------------------------------- |
| 296 | |
class TestPruneAfterWalk:
    """Cache entries for files that disappeared are dropped by the next walk."""

    def test_deleted_file_pruned_on_walk(self, tmp_path: pathlib.Path) -> None:
        """After deleting a file and re-walking, only the survivor stays cached."""
        dot_muse = _make_muse_dir(tmp_path)
        _write(tmp_path / "keep.py", "keep")
        doomed = _write(tmp_path / "delete_me.py", "bye")

        # First walk — both files cached
        walk_workdir(tmp_path)
        entries_before = StatCache.load(dot_muse)._entries
        for name in ("keep.py", "delete_me.py"):
            assert name in entries_before

        # Delete one file then re-walk
        doomed.unlink()
        walk_workdir(tmp_path)

        entries_after = StatCache.load(dot_muse)._entries
        assert "keep.py" in entries_after
        assert "delete_me.py" not in entries_after, "Stale cache entry not pruned"
| 316 | |
| 317 | |
| 318 | # --------------------------------------------------------------------------- |
| 319 | # 6. Path safety |
| 320 | # --------------------------------------------------------------------------- |
| 321 | |
class TestPathSafety:
    """Only regular, non-symlink files outside ``.muse/`` belong in a manifest."""

    def test_symlinks_excluded_from_manifest(self, tmp_path: pathlib.Path) -> None:
        """A symlink to a tracked file is absent while its target is present."""
        _make_muse_dir(tmp_path)
        target = _write(tmp_path / "real.py", "real")
        (tmp_path / "link.py").symlink_to(target)

        keys = build_snapshot_manifest(tmp_path)
        assert "real.py" in keys
        assert "link.py" not in keys, "Symlinks must be excluded"

    def test_non_regular_files_excluded(self, tmp_path: pathlib.Path) -> None:
        """A FIFO next to a normal file does not appear in the manifest."""
        _make_muse_dir(tmp_path)
        _write(tmp_path / "normal.py", "ok")
        # Create a FIFO (named pipe) — non-regular file
        os.mkfifo(str(tmp_path / "pipe.fifo"))

        keys = build_snapshot_manifest(tmp_path)
        assert "normal.py" in keys
        assert "pipe.fifo" not in keys

    def test_muse_dir_excluded_from_manifest(self, tmp_path: pathlib.Path) -> None:
        """No manifest key starts with ``.muse/``."""
        _make_muse_dir(tmp_path)
        _write(tmp_path / "real.py", "code")

        keys = build_snapshot_manifest(tmp_path)
        assert all(not key.startswith(".muse/") for key in keys)
| 350 | |
| 351 | |
| 352 | # --------------------------------------------------------------------------- |
| 353 | # 7. Warm-walk no-op: second commit on unchanged tree is fast |
| 354 | # --------------------------------------------------------------------------- |
| 355 | |
class TestWarmWalkPerformance:
    """A second walk over an unchanged tree should rely on the stat cache."""

    def test_warm_walk_uses_cache_no_rehash(self, tmp_path: pathlib.Path) -> None:
        """Unchanged files must not be re-hashed on the second walk."""
        _make_muse_dir(tmp_path)
        for index in range(100):
            _write(tmp_path / f"f{index}.py", f"content {index}")

        # Cold walk — populates cache
        walk_workdir(tmp_path)

        # Record how many entries the cold walk persisted.
        dot_muse = muse_dir(tmp_path)
        entries_after_cold = len(StatCache.load(dot_muse)._entries)

        # Warm walk — should be all cache hits
        walk_workdir(tmp_path)

        # The persisted cache must hold the same number of entries afterwards.
        entries_after_warm = len(StatCache.load(muse_dir(tmp_path))._entries)
        assert entries_after_warm == entries_after_cold

    @pytest.mark.slow
    def test_warm_walk_500ms_on_1000_files(self, tmp_path: pathlib.Path) -> None:
        """1000-file warm walk must complete in < 500 ms."""
        _make_muse_dir(tmp_path)
        for index in range(1000):
            _write(tmp_path / f"warm_{index:04d}.py", f"content {index} " * 10)

        walk_workdir(tmp_path)  # cold

        started = time.perf_counter()
        walk_workdir(tmp_path)  # warm
        elapsed = time.perf_counter() - started

        budget_seconds = 0.5
        assert elapsed < budget_seconds, (
            f"Warm walk over 1000 files took {elapsed:.3f}s — must be < 500 ms. "
            "Cache is not being consulted."
        )
| 393 | |
| 394 | |
| 395 | # --------------------------------------------------------------------------- |
| 396 | # 8. Deep directory nesting — no recursion errors |
| 397 | # --------------------------------------------------------------------------- |
| 398 | |
class TestDeepNesting:
    """Manifests over deeply nested directory chains."""

    def test_100_level_nesting_no_error(self, tmp_path: pathlib.Path) -> None:
        """os.walk must not hit recursion limit at 100 directory levels."""
        _make_muse_dir(tmp_path)
        _create_files_deep(tmp_path, depth=100, files_per_level=1)

        manifest = build_snapshot_manifest(tmp_path)
        # One file per level, so exactly 100 entries are expected.
        assert len(manifest) == 100, f"Expected 100 files, got {len(manifest)}"

    def test_50_level_nesting_correct_paths(self, tmp_path: pathlib.Path) -> None:
        """Paths at deep levels must be POSIX-relative with correct separators."""
        _make_muse_dir(tmp_path)
        # l0/l1/.../l49 created in one call, with a single file at the bottom.
        deepest = tmp_path.joinpath(*(f"l{index}" for index in range(50)))
        deepest.mkdir(parents=True)
        (deepest / "deep.py").write_bytes(b"deep")

        manifest = build_snapshot_manifest(tmp_path)
        # Path uses forward slashes regardless of OS
        deep_keys = [key for key in manifest if "deep.py" in key and "/" in key]
        assert deep_keys
        assert all("\\" not in key for key in manifest)
| 422 | |
| 423 | |
| 424 | # --------------------------------------------------------------------------- |
| 425 | # 9. Large-file streaming — memory stays flat |
| 426 | # --------------------------------------------------------------------------- |
| 427 | |
class TestLargeFileStreaming:
    """Hashing must cope with very large and with empty files."""

    @pytest.mark.slow
    def test_large_file_memory_flat(self, tmp_path: pathlib.Path) -> None:
        """Hashing a 50 MiB file must not load it all into memory at once."""
        import sys as _sys

        _make_muse_dir(tmp_path)
        big_file = tmp_path / "large.bin"
        block_64k = b"Z" * 65_536
        blocks_written = 0
        with big_file.open("wb") as sink:
            while blocks_written < 800:  # 800 × 64 KiB = 50 MiB
                sink.write(block_64k)
                blocks_written += 1

        peak_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        manifest = build_snapshot_manifest(tmp_path)
        peak_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

        assert "large.bin" in manifest
        # ru_maxrss is bytes on macOS, KiB on Linux.
        if _sys.platform == "darwin":
            _scale = 1024 * 1024
        else:
            _scale = 1024
        rss_delta_mib = (peak_after - peak_before) / _scale
        # RSS growth must be < 10 MiB (file is 50 MiB — proves streaming)
        assert rss_delta_mib < 10, (
            f"RSS grew {rss_delta_mib:.1f} MiB while hashing a 50 MiB file "
            "— file is being loaded entirely into memory."
        )

    def test_zero_byte_file_hashes_correctly(self, tmp_path: pathlib.Path) -> None:
        """An empty file maps to ``blob_id(b"")`` in the manifest."""
        _make_muse_dir(tmp_path)
        (tmp_path / "empty.py").write_bytes(b"")

        manifest = build_snapshot_manifest(tmp_path)
        assert manifest["empty.py"] == blob_id(b"")
| 462 | |
| 463 | |
| 464 | # --------------------------------------------------------------------------- |
| 465 | # 10. 75 000-file scale — cold walk correctness + warm walk timing |
| 466 | # --------------------------------------------------------------------------- |
| 467 | |
@pytest.mark.slow
class TestLinuxKernelScale:
    """Scale tests: 75 000-file trees plus a large-file cache speedup check."""

    @staticmethod
    def _populate_tree(
        root: pathlib.Path, count: int, content: bytes, suffix: str
    ) -> list[pathlib.Path]:
        """Create *count* files under *root*, 1000 per ``dNNN`` subdirectory.

        Files are named ``fNNNNNN<suffix>`` and all contain *content*.
        Returns the created paths in creation order.
        """
        created: list[pathlib.Path] = []
        for i in range(count):
            subdir = root / f"d{i // 1000:03d}"
            subdir.mkdir(exist_ok=True)
            path = subdir / f"f{i:06d}{suffix}"
            path.write_bytes(content)
            created.append(path)
        return created

    def test_75k_cold_walk_correct_and_bounded(self, tmp_path: pathlib.Path) -> None:
        """75k files: cold walk must be correct and complete in < 30 s."""
        _make_muse_dir(tmp_path)
        n = 75_000
        content = b"k" * 1024  # 1 KiB per file
        self._populate_tree(tmp_path, n, content, ".c")

        # Only the walk is timed; tree creation is deliberately excluded.
        t_walk = time.perf_counter()
        manifest = build_snapshot_manifest(tmp_path)
        walk_seconds = time.perf_counter() - t_walk

        assert len(manifest) == n, f"Expected {n} files, got {len(manifest)}"
        # All hashes must be the 1 KiB content hash (same content → same hash)
        expected_hash = blob_id(content)
        assert all(v == expected_hash for v in manifest.values()), (
            "Hash mismatch — some files were hashed incorrectly"
        )
        assert walk_seconds < 30, (
            f"75k cold walk took {walk_seconds:.1f}s — must be < 30 s"
        )

    def test_75k_warm_walk_zero_misses(self, tmp_path: pathlib.Path) -> None:
        """75k files: the cache persisted by the walks must serve every file.

        Uses ``_dirty`` as the oracle, the same way
        ``test_unchanged_file_is_cache_hit`` does: the persisted cache is
        reloaded, ``_dirty`` is cleared, and every file is looked up through
        that same instance.  Any miss re-hashes the file and flips ``_dirty``.
        This is platform-independent: it does not depend on timing.
        """
        _make_muse_dir(tmp_path)
        n = 75_000
        content = b"w" * 512
        paths = self._populate_tree(tmp_path, n, content, ".go")

        build_snapshot_manifest(tmp_path)  # cold — populates cache
        manifest2 = build_snapshot_manifest(tmp_path)  # warm
        assert len(manifest2) == n

        # The walk builds its own cache object internally, so inspecting a
        # separately loaded instance that is never used proves nothing.
        # Instead, drive the lookups through the instance being asserted on.
        cache = StatCache.load(muse_dir(tmp_path))
        cache._dirty = False  # reset to ensure we detect any miss
        for path in paths:
            cache.get_object_hash(tmp_path, path)

        assert not cache._dirty, (
            "Warm walk marked cache dirty — at least one file was re-hashed. "
            "The stat cache is not being consulted correctly at 75k-file scale."
        )

    def test_cache_speedup_with_large_files(self, tmp_path: pathlib.Path) -> None:
        """Cache delivers ≥ 5× speedup when file-content I/O dominates.

        Uses 100 × 1 MiB files so cold-walk hashing clearly dominates lstat
        overhead.  Warm walk skips all file reads → dramatic speedup.
        Platform-independent: cold is I/O bound; warm is stat-bound.
        """
        _make_muse_dir(tmp_path)
        content = b"L" * (1024 * 1024)  # 1 MiB per file
        for i in range(100):
            (tmp_path / f"large_{i:03d}.bin").write_bytes(content)

        t_cold0 = time.perf_counter()
        build_snapshot_manifest(tmp_path)  # cold
        t_cold = time.perf_counter() - t_cold0

        t_warm0 = time.perf_counter()
        build_snapshot_manifest(tmp_path)  # warm
        t_warm = time.perf_counter() - t_warm0

        # Guard against a zero-length warm measurement on coarse clocks.
        speedup = t_cold / t_warm if t_warm > 0 else float("inf")
        assert speedup >= 5, (
            f"Warm walk ({t_warm:.3f}s) is only {speedup:.1f}× faster than "
            f"cold ({t_cold:.3f}s) — expected ≥ 5×. "
            "Cache must skip all file-content reads on warm walk."
        )

    def test_75k_partial_change_only_rehashes_changed(
        self, tmp_path: pathlib.Path
    ) -> None:
        """After changing 10 files, exactly those 10 carry the new hash."""
        _make_muse_dir(tmp_path)
        n = 75_000
        content = b"p" * 256
        paths = self._populate_tree(tmp_path, n, content, ".rs")

        build_snapshot_manifest(tmp_path)  # cold

        # Modify 10 files
        changed = paths[:10]
        new_content = b"CHANGED" * 37  # different size ensures definite miss
        for p in changed:
            p.write_bytes(new_content)

        manifest2 = build_snapshot_manifest(tmp_path)

        new_hash = blob_id(new_content)
        old_hash = blob_id(content)
        # Manifest keys are POSIX-style relative paths.
        changed_rels = {p.relative_to(tmp_path).as_posix() for p in changed}

        for rel, h in manifest2.items():
            if rel in changed_rels:
                assert h == new_hash, f"{rel} should have new hash"
            else:
                assert h == old_hash, f"{rel} should still have old hash"
File History
1 commit
sha256:06dba78c2a78e251b580422dd1fd547f3c8357ff18f7709a860873b2d24dbbbf
chore: bump version to 0.2.0rc14
Sonnet 4.6
patch
15 hours ago