"""TDD tests for StatCache integration into symbols_for_snapshot.

Root cause
----------
``symbols_for_snapshot(workdir=root)`` always calls ``disk_path.read_bytes()``
for every Python file to compute the SHA-256 cache key, even when the file
hasn't changed since the last run.  On the muse repo (~400 files) this costs
~9,700 ms of pure disk I/O every single invocation.

Fix
---
Accept a ``stat_cache: StatCache | None`` parameter.  On a stat-cache hit
(``ino + mtime + size`` match) the SHA-256 is already known — skip
``read_bytes()`` entirely.  Only when the SymbolCache also misses do we
actually read the file.

Coverage
--------
- ``symbols_for_snapshot`` accepts ``stat_cache=`` keyword argument.
- On stat-cache hit + symbol-cache hit: ``read_bytes()`` is never called.
- On stat-cache hit + symbol-cache miss: file is read once (to parse).
- On stat-cache miss: file is read (to hash + parse if needed).
- Stat cache is populated after a workdir call.
- Results are identical whether stat_cache is supplied or not.
- ``stat_cache`` is ignored when ``workdir=None`` (committed-blob path).
"""

from __future__ import annotations

import hashlib
import pathlib
from unittest.mock import patch, MagicMock

import pytest

from muse.core.types import blob_id
from muse.core.object_store import write_object
from muse.core.stat_cache import StatCache
from muse.core.symbol_cache import SymbolCache
from muse.plugins.code._query import symbols_for_snapshot
from muse.core.paths import muse_dir


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# First revision of the fixture module: two small functions.
_PY_SRC = b"""\
def compute(x: int) -> int:
    return x * 2

def helper() -> int:
    return 42
"""

# Second revision: ``compute`` gains a parameter and both bodies change.
_PY_SRC_V2 = b"""\
def compute(x: int, y: int = 0) -> int:
    return x * 2 + y

def helper() -> int:
    return 99
"""


def _make_repo(
    tmp_path: pathlib.Path, content: bytes = _PY_SRC
) -> tuple[pathlib.Path, dict]:
    """Build a one-file repo under *tmp_path* and return ``(root, manifest)``.

    The directory given by ``muse_dir(tmp_path)`` is created, *content* is
    stored through ``write_object`` under the id computed by ``blob_id``, and
    the same bytes are written to ``billing.py`` directly inside *tmp_path*.

    Args:
        tmp_path: Directory that becomes the repo root.
        content: Bytes used for both the stored object and ``billing.py``.

    Returns:
        *tmp_path* itself plus a manifest mapping ``"billing.py"`` to the
        object id of *content*.
    """
    muse_dir(tmp_path).mkdir()

    object_id = blob_id(content)
    write_object(tmp_path, object_id, content)

    source_file = tmp_path / "billing.py"
    source_file.write_bytes(content)

    manifest = {"billing.py": object_id}
    return tmp_path, manifest


# ---------------------------------------------------------------------------
# 1.
# symbols_for_snapshot accepts stat_cache= keyword
# ---------------------------------------------------------------------------


class TestAcceptsStatCache:
    """``symbols_for_snapshot`` takes ``stat_cache=`` without changing results."""

    def test_accepts_stat_cache_none(self, tmp_path: pathlib.Path) -> None:
        """An explicit ``stat_cache=None`` still returns symbols for the file."""
        root, manifest = _make_repo(tmp_path)
        result = symbols_for_snapshot(root, manifest, workdir=root, stat_cache=None)
        assert "billing.py" in result

    def test_accepts_stat_cache_instance(self, tmp_path: pathlib.Path) -> None:
        """An empty ``StatCache`` instance is accepted as ``stat_cache=``."""
        root, manifest = _make_repo(tmp_path)
        sc = StatCache.empty()
        result = symbols_for_snapshot(root, manifest, workdir=root, stat_cache=sc)
        assert "billing.py" in result

    def test_result_unchanged_with_or_without_stat_cache(
        self, tmp_path: pathlib.Path
    ) -> None:
        """The set of symbol keys for the file is the same with or without a stat cache."""
        root, manifest = _make_repo(tmp_path)
        r1 = symbols_for_snapshot(root, manifest, workdir=root)
        r2 = symbols_for_snapshot(
            root, manifest, workdir=root, stat_cache=StatCache.empty()
        )
        # Only the keys are compared, not the per-symbol values.
        assert set(r1.get("billing.py", {})) == set(r2.get("billing.py", {}))


# ---------------------------------------------------------------------------
# 2. Stat-cache hit + symbol-cache hit → read_bytes never called
# ---------------------------------------------------------------------------


def _py_reads_during_snapshot(
    root: pathlib.Path,
    manifest: dict,
    *,
    cache: SymbolCache,
    stat_cache: StatCache,
) -> list[str]:
    """Run ``symbols_for_snapshot`` on the workdir and report ``.py`` reads.

    ``pathlib.Path.read_bytes`` is patched for the duration of the call with a
    wrapper that records the path of every file whose suffix is ``.py`` and
    then delegates to the real implementation, so reads still succeed.

    Args:
        root: Repo root, also passed as ``workdir``.
        manifest: Manifest handed to ``symbols_for_snapshot``.
        cache: Symbol cache passed as ``cache=``.
        stat_cache: Stat cache passed as ``stat_cache=``.

    Returns:
        String paths of ``.py`` files read via ``Path.read_bytes``, in call order.
    """
    reads: list[str] = []
    real_read_bytes = pathlib.Path.read_bytes

    def counting_read_bytes(self_path: pathlib.Path) -> bytes:
        if self_path.suffix == ".py":
            reads.append(str(self_path))
        return real_read_bytes(self_path)

    with patch.object(pathlib.Path, "read_bytes", counting_read_bytes):
        symbols_for_snapshot(
            root, manifest, workdir=root, cache=cache, stat_cache=stat_cache
        )
    return reads


class TestStatCacheHitSkipsRead:
    """Read counts for ``.py`` files when the stat cache is already warm."""

    def test_warm_stat_and_symbol_cache_skips_read_bytes(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Both caches warm → file bytes never read."""
        root, manifest = _make_repo(tmp_path)

        # Warm both caches with a cold run.
        sym_cache = SymbolCache.load(muse_dir(root))
        stat_cache = StatCache.load(muse_dir(root))
        symbols_for_snapshot(
            root, manifest, workdir=root, cache=sym_cache, stat_cache=stat_cache
        )
        sym_cache.save()
        stat_cache.save()

        # Reload from disk — fully warm.
        sym_cache2 = SymbolCache.load(muse_dir(root))
        stat_cache2 = StatCache.load(muse_dir(root))

        read_call_count = _py_reads_during_snapshot(
            root, manifest, cache=sym_cache2, stat_cache=stat_cache2
        )

        assert read_call_count == [], (
            f"read_bytes called on warm cache for: {read_call_count}"
        )

    def test_stat_cache_hit_symbol_cache_miss_reads_once(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Stat-cache hit but cold symbol cache → file read exactly once."""
        root, manifest = _make_repo(tmp_path)

        # Warm only the stat cache.
        stat_cache = StatCache.load(muse_dir(root))
        symbols_for_snapshot(root, manifest, workdir=root, stat_cache=stat_cache)
        stat_cache.save()

        stat_cache2 = StatCache.load(muse_dir(root))
        cold_sym_cache = SymbolCache.empty()

        read_call_count = _py_reads_during_snapshot(
            root, manifest, cache=cold_sym_cache, stat_cache=stat_cache2
        )

        assert len(read_call_count) == 1, (
            f"Expected exactly 1 read on stat-hit/sym-miss, got {read_call_count}"
        )


# ---------------------------------------------------------------------------
# 3.
# Stat cache is populated after a workdir call
# ---------------------------------------------------------------------------


class TestStatCachePopulated:
    """A workdir run followed by ``save()`` leaves a usable stat cache on disk."""

    def test_stat_cache_has_entry_after_workdir_call(
        self, tmp_path: pathlib.Path
    ) -> None:
        """A reloaded stat cache returns the blob id of ``billing.py``."""
        root, manifest = _make_repo(tmp_path)

        first_cache = StatCache.load(muse_dir(root))
        symbols_for_snapshot(root, manifest, workdir=root, stat_cache=first_cache)
        first_cache.save()

        # Query a fresh instance so the answer comes from what was saved.
        reloaded = StatCache.load(muse_dir(root))
        tracked_file = root / "billing.py"
        obj_hash = reloaded.get_object_hash(root, tracked_file)

        # billing.py must be in the cache after the workdir call.
        assert obj_hash == blob_id(_PY_SRC), (
            f"Stat cache returned wrong hash: {obj_hash}"
        )

    def test_stat_cache_file_created_on_disk(self, tmp_path: pathlib.Path) -> None:
        """``save()`` creates ``cache/stat.json`` inside the muse directory."""
        root, manifest = _make_repo(tmp_path)

        cache = StatCache.load(muse_dir(root))
        symbols_for_snapshot(root, manifest, workdir=root, stat_cache=cache)
        cache.save()

        stat_file = muse_dir(root) / "cache" / "stat.json"
        assert stat_file.exists()


# ---------------------------------------------------------------------------
# 4. stat_cache= ignored when workdir=None (committed-blob path unchanged)
# ---------------------------------------------------------------------------


class TestStatCacheIgnoredWithoutWorkdir:
    """Supplying ``stat_cache=`` without ``workdir`` must be harmless."""

    def test_no_read_bytes_called_for_committed_blobs(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Committed path reads from object store, not disk — stat_cache irrelevant."""
        root, manifest = _make_repo(tmp_path)
        unused_cache = StatCache.empty()

        # NOTE(review): despite the test name, read_bytes is not instrumented
        # here; the test only checks that the call succeeds and returns symbols.
        # Should not raise and should return symbols.
        result = symbols_for_snapshot(root, manifest, stat_cache=unused_cache)

        assert "billing.py" in result


# ---------------------------------------------------------------------------
# 5.
# Changed file invalidates stat cache → re-read
# ---------------------------------------------------------------------------


class TestStatCacheInvalidation:
    """Editing a tracked file after the stat cache was warmed."""

    def test_edited_file_triggers_reread(self, tmp_path: pathlib.Path) -> None:
        """After editing a file, stat cache miss → file is re-read."""
        root, manifest = _make_repo(tmp_path)

        # Warm stat cache with v1.  The result is not needed; the call is made
        # only so that the saved cache describes the v1 file.
        sc = StatCache.load(muse_dir(root))
        symbols_for_snapshot(root, manifest, workdir=root, stat_cache=sc)
        sc.save()

        # Edit file on disk (v2 — different content, new mtime).
        (root / "billing.py").write_bytes(_PY_SRC_V2)

        sc2 = StatCache.load(muse_dir(root))
        r2 = symbols_for_snapshot(root, manifest, workdir=root, stat_cache=sc2)
        syms2 = set(r2.get("billing.py", {}))

        # Both versions define 'compute' and 'helper', so the result should
        # still be parseable (regression: must not crash or return stale).
        # NOTE(review): these assertions would also pass on a stale v1 result,
        # since 'compute' exists in both revisions.  Comparing a per-symbol
        # field that differs between v1 and v2 would tighten this — confirm
        # the symbol record shape before adding it.
        assert "billing.py" in r2
        assert any("compute" in addr for addr in syms2)