"""Stat-based file hash cache — fast snapshot computation for all domains. Architecture ------------ Every ``plugin.snapshot()`` call must hash every tracked file to detect changes. On a repository with hundreds of files this is the dominant cost of ``muse status``, ``muse diff``, and any command that calls ``snapshot()``. ``StatCache`` eliminates redundant I/O by persisting two classes of hash per file between invocations: Object hash SHA-256 of raw bytes. Used by the content-addressed object store. Recomputed only when ``(ino, mtime, size)`` changes. Dimension hashes Domain-specific semantic hashes. For the code domain these might be the SHA-256 of the AST symbol set, the import set, and so on. For the MIDI domain they might be the hash of parsed note events, tempo map, and harmony analysis. Populated by domain plugins after parsing; consumed by ``diff()`` and ``merge()`` to skip re-parsing unchanged files entirely. An empty ``dimensions`` dict means no semantic hashes are cached yet — this is the baseline state and is always safe. Cache validity -------------- A cache entry is valid when the file's current ``(st_ino, st_mtime, st_size)`` exactly match the stored values. Including the inode number eliminates the "racy Muse" false-cache-hit that the previous ``(mtime, size)``-only check was vulnerable to: an atomically replaced file (e.g. from a build system) gets a fresh inode number even when its mtime and size are unchanged. The cache is **self-healing**: a cache miss always triggers a fresh hash and updates the stored entry. Storage ------- ``.muse/cache/stat.json`` — a versioned JSON document (version 4). Format: .. code-block:: text { "version": 4, "entries": { "muse/core/snapshot.py": { "mtime": 1710000000.123456, "size": 4321, "ino": 12345678, "object_hash": "", "dimensions": { "symbols": "", "imports": "" } } } } Writes are atomic: data is flushed via ``os.fdopen(mkstemp(...))`` then ``os.replace``-d over the target, so a crash mid-write never corrupts the cache. Each concurrent ``muse commit`` process gets its own unique temp file (via ``mkstemp``) so parallel saves do not collide. """ import logging import os import pathlib import tempfile from typing import TypedDict import json as _json from muse.core.types import MUSE_DIR, hash_file, long_id from muse.core.paths import cache_dir as _cache_dir_path, muse_dir as _muse_dir logger = logging.getLogger(__name__) type _DimensionMap = dict[str, str] type _EntryMap = dict[str, FileCacheEntry] _CACHE_VERSION = 4 _CACHE_FILENAME = "stat.json" # Defense in depth: refuse to load a cache file larger than this. # A 75k-file repo produces ~15 MiB of JSON; 256 MiB is a generous ceiling. MAX_CACHE_BYTES: int = 256 * 1024 * 1024 class FileCacheEntry(TypedDict): """Persisted metadata for a single workspace file.""" mtime: float size: int ino: int # inode number — disambiguates atomically replaced files object_hash: str # Domain plugins write semantic hashes here after parsing. # Keys are dimension names ("symbols", "imports", "notes", …). # Empty dict == no dimension hashes cached yet; always safe to return None. dimensions: _DimensionMap class _CacheDoc(TypedDict): """On-disk JSON document shape.""" version: int entries: _EntryMap def _hash_bytes(path: pathlib.Path) -> str: """Return the ``sha256:``-prefixed content ID of *path*'s raw bytes.""" return hash_file(path) def _hash_str(path_str: str) -> str: """String-path convenience wrapper around :func:`~muse.core.types.hash_file`. Used in the hot inner loop of ``walk_workdir`` and plugin snapshot methods where the file path is already a plain string from ``os.walk``. """ return hash_file(pathlib.Path(path_str)) class StatCache: """Shared stat-based hash cache for all domain plugin ``snapshot()`` calls. Typical lifecycle inside a plugin's ``snapshot()``:: cache = StatCache.load(root / MUSE_DIR) for file_path in walk(...): files[rel] = cache.get_object_hash(root, file_path) cache.prune(set(files)) cache.save() The same instance can be passed to ``diff()`` or ``merge()`` logic to retrieve already-computed dimension hashes without re-parsing files. """ def __init__( self, cache_dir: pathlib.Path | None, entries: _EntryMap ) -> None: self._cache_dir = cache_dir self._entries = entries self._dirty = False # ------------------------------------------------------------------ # Construction # ------------------------------------------------------------------ @classmethod def load(cls, muse_dir: pathlib.Path) -> StatCache: """Load the cache from *muse_dir*/cache/stat.json. Validates the version field and every entry's field types on load so a corrupt or future-format file never poisons the cache. Returns a fresh empty cache if the file is absent, unreadable, exceeds ``MAX_CACHE_BYTES``, or version mismatches — never raises. """ cache_dir = muse_dir / "cache" cache_file = cache_dir / _CACHE_FILENAME if not cache_file.is_file(): return cls(cache_dir, {}) try: file_size = cache_file.stat().st_size if file_size > MAX_CACHE_BYTES: logger.critical( "❌ stat_cache %s is %d bytes — exceeds %d MiB limit; " "starting fresh.", cache_file.name, file_size, MAX_CACHE_BYTES // (1024 * 1024), ) return cls(cache_dir, {}) file_bytes = cache_file.read_bytes() # Old binary msgpack files start with a byte > 0x7F — treat as stale. if file_bytes and file_bytes[0] > 0x7F: logger.debug("⚠️ stat_cache is old binary format — starting fresh") return cls(cache_dir, {}) raw = _json.loads(file_bytes.decode("utf-8")) if not (isinstance(raw, dict) and raw.get("version") == _CACHE_VERSION): return cls(cache_dir, {}) raw_entries = raw.get("entries") if not isinstance(raw_entries, dict): return cls(cache_dir, {}) entries: _EntryMap = {} for rel, ev in raw_entries.items(): if not isinstance(rel, str) or not isinstance(ev, dict): continue mtime = ev.get("mtime") size = ev.get("size") ino = ev.get("ino") obj_hash = ev.get("object_hash") dims = ev.get("dimensions") if not ( isinstance(mtime, (int, float)) and isinstance(size, int) and isinstance(ino, int) and isinstance(obj_hash, str) and isinstance(dims, dict) ): continue entries[rel] = FileCacheEntry( mtime=float(mtime), size=size, ino=ino, object_hash=obj_hash, dimensions={str(k): str(v) for k, v in dims.items()}, ) return cls(cache_dir, entries) except Exception: logger.debug("⚠️ stat_cache unreadable — starting fresh") return cls(cache_dir, {}) @classmethod def empty(cls) -> StatCache: """Return a no-op cache for contexts without a ``.muse`` directory.""" return cls(None, {}) # ------------------------------------------------------------------ # Object hash — raw-bytes SHA-256 # ------------------------------------------------------------------ def get_cached( self, rel: str, abs_path_str: str, mtime: float, size: int, ino: int ) -> str: """Fast inner-loop hash lookup with pre-computed stat values. Callers that already have ``(mtime, size, ino)`` from an ``os.stat`` or ``os.walk`` call should use this method to avoid a redundant ``stat()`` syscall inside :meth:`get_object_hash`. The inode number (``ino``) is included in the cache key in addition to ``mtime`` and ``size`` to eliminate false cache hits when a file is atomically replaced (e.g. by a build system or ``muse checkout``) with content of identical size. An atomically replaced file always gets a new inode, so the cache correctly invalidates. Args: rel: Workspace-relative POSIX path (cache key). abs_path_str: Absolute file path as a plain string — avoids constructing a ``pathlib.Path`` in the hot loop. mtime: ``st_mtime`` from the caller's stat result. size: ``st_size`` from the caller's stat result. ino: ``st_ino`` from the caller's stat result. Returns: 64-character lowercase hex SHA-256 digest. """ entry = self._entries.get(rel) if ( entry is not None and entry["ino"] == ino and entry["mtime"] == mtime and entry["size"] == size ): cached = entry["object_hash"] # Normalize legacy bare-hex cache entries to the canonical format. return long_id(cached) obj_hash = _hash_str(abs_path_str) self._entries[rel] = FileCacheEntry( mtime=mtime, size=size, ino=ino, object_hash=obj_hash, dimensions={}, ) self._dirty = True return obj_hash def get_object_hash(self, root: pathlib.Path, file_path: pathlib.Path) -> str: """Return the SHA-256 of *file_path*, using the cache when valid. Convenience wrapper around :meth:`get_cached` for callers that work with ``pathlib.Path`` objects. The hot inner loops of ``walk_workdir`` and plugin snapshot methods call :meth:`get_cached` directly to skip pathlib overhead. Args: root: Repository root — used to compute the workspace-relative POSIX key. file_path: Absolute path to the file. Returns: 64-character lowercase hex SHA-256 digest. """ rel = file_path.relative_to(root).as_posix() st = file_path.stat() return self.get_cached(rel, str(file_path), st.st_mtime, st.st_size, st.st_ino) # ------------------------------------------------------------------ # Dimension hashes — domain-specific semantic hashes # ------------------------------------------------------------------ def get_dimension( self, root: pathlib.Path, file_path: pathlib.Path, dimension: str, ) -> str | None: """Return a cached dimension hash, or ``None`` if not yet computed. Callers must verify that the entry is still valid by checking that the object hash hasn't changed (i.e. call ``get_object_hash`` first to ensure the entry is fresh). Args: root: Repository root. file_path: Absolute path to the file. dimension: Dimension name, e.g. ``"symbols"`` or ``"notes"``. Returns: Cached hash string, or ``None`` if absent. """ rel = file_path.relative_to(root).as_posix() entry = self._entries.get(rel) if entry is None: return None return entry["dimensions"].get(dimension) def set_dimension( self, root: pathlib.Path, file_path: pathlib.Path, dimension: str, hash_value: str, ) -> None: """Store a semantic hash for a specific dimension of *file_path*. Should be called by domain plugins after parsing a file whose object hash triggered a cache miss. Silently ignored if the file has no entry (which should not happen in normal operation). Args: root: Repository root. file_path: Absolute path to the file. dimension: Dimension name, e.g. ``"symbols"``. hash_value: Hash string to store. """ rel = file_path.relative_to(root).as_posix() entry = self._entries.get(rel) if entry is None: return entry["dimensions"][dimension] = hash_value self._dirty = True # ------------------------------------------------------------------ # Lifecycle helpers # ------------------------------------------------------------------ def prune(self, known_paths: set[str]) -> None: """Remove entries for paths no longer present in the working tree. Call this after a full directory walk, passing the set of workspace-relative POSIX paths that were found. Keeps the cache lean by evicting stale entries for deleted files. Args: known_paths: Set of rel-posix paths observed during the walk. """ stale = set(self._entries) - known_paths if stale: for k in stale: del self._entries[k] self._dirty = True def save(self) -> None: """Atomically persist the cache to disk if it has changed. Uses ``mkstemp`` for a process-unique temp file (no concurrent-save collisions), ``fsync`` for durability, then ``os.replace`` for atomicity. Silently skips when there is no ``.muse`` directory (e.g. in-memory unit tests). """ if not self._dirty or self._cache_dir is None: return self._cache_dir.mkdir(parents=True, exist_ok=True) doc = _CacheDoc(version=_CACHE_VERSION, entries=self._entries) payload = _json.dumps(doc, ensure_ascii=False, separators=(",", ":")).encode("utf-8") cache_file = self._cache_dir / _CACHE_FILENAME fd, tmp_path = tempfile.mkstemp( dir=self._cache_dir, prefix=".stat_cache_", suffix=".tmp" ) try: with os.fdopen(fd, "wb") as fh: fh.write(payload) fh.flush() os.fsync(fh.fileno()) except Exception: try: os.unlink(tmp_path) except OSError: pass raise os.replace(tmp_path, cache_file) self._dirty = False logger.debug("✅ stat_cache saved (%d entries)", len(self._entries)) def load_cache(root: pathlib.Path) -> StatCache: """Convenience loader: return a ``StatCache`` for a repository root. Returns ``StatCache.empty()`` when *root* has no ``.muse`` directory so callers never need to guard against a missing repo. Args: root: Repository root (the directory that contains ``.muse/``). Returns: A ``StatCache`` instance ready for use. """ muse_dir = root / MUSE_DIR if muse_dir.is_dir(): return StatCache.load(muse_dir) return StatCache.empty()