gabriel / muse public
snapshot.py python
517 lines 20.3 KB
Raw
sha256:84df9126d09aeec0b8f1b908f0b06c10913feec28f3514b382efb1ba6d619385 refactor: rename StructuredMergePlugin to AddressedMergePlu… Sonnet 4.6 minor ⚠ breaking 25 days ago
1 """Pure filesystem snapshot logic for ``muse commit``.
2
3 All functions here are side-effect-free (no DB, no I/O besides reading
4 files under ``workdir``). They are kept separate so they can be
5 unit-tested without a database.
6
7 ID derivation contract (deterministic, no random components):
8
9 object_id = "sha256:" + sha256(file_bytes).hexdigest()
10
11 snapshot_id = "sha256:" + sha256(
12 f"snapshot {len(payload)}NUL" + payload
13 ).hexdigest() # payload = NUL.join(sorted(f"{path}NUL{strip(oid)}" for path, oid in manifest.items()) + [f"dirNUL{d}" for d in sorted(directories)])
14 # strip() removes any leading "sha256:" prefix
15
16 commit_id = "sha256:" + sha256(
17 f"commit {len(payload)}NUL" + payload
18 ).hexdigest() # payload = NUL.join([NUL.join(sorted(strip(p) for p in parent_ids)), strip(snapshot_id), message, committed_at_iso, author, signer_public_key])
19 # strip() removes any leading "sha256:" prefix
20
21 All three functions normalize their inputs by stripping any ``sha256:`` prefix
22 before hashing. This makes the IDs stable regardless of whether callers pass
23 canonical ``sha256:<hex>`` or legacy bare-hex strings — the resulting ID is
24 always identical.
25
26 The null byte (\\x00) is used as the field separator because it is:
27 - Illegal in POSIX filenames (preventing separator-injection attacks from
28 crafted file paths).
29 - Absent from SHA-256 hex strings (preventing injection via object IDs).
30 - Absent from ISO-8601 timestamps and typical message text.
31
32 This replaces the previous ``|`` / ``:`` separator scheme which allowed two
33 distinct manifests or commit inputs to produce the same hash if filenames
34 contained those characters.
35
36 Symlinks in the working tree are excluded from snapshots. Following a
37 symlink that points outside state/ would silently commit the contents
38 of arbitrary filesystem paths.
39
40 Exclusion policy
41 ----------------
42 Dotfiles and dot-directories are **tracked by default** — ``.cursorrules``,
43 ``.editorconfig``, ``.eslintrc`` are intentional project configuration that
44 collaborators need. Exclusion is driven entirely by ``.museignore`` plus the
45 built-in secrets blocklist below. The only hard-coded directory skip is
46 ``.muse/`` itself (internal VCS storage) and a performance-only list of
47 directories that are universally noise (``node_modules/``, ``__pycache__/``,
48 ``.venv/`` etc.).
49 """
50
51 import fnmatch
52 import os
53 import pathlib
54 import re
55 import stat as _stat
56
57 import hashlib as _hashlib
58
59 from muse.core.types import Manifest, blob_id, hash_file, load_json_file, split_id
60 from muse.core.paths import repo_json_path as _repo_json_path
61 from muse.core.ignore import is_ignored, load_force_track_paths, load_ignore_config, resolve_patterns
62 from muse.core.stat_cache import load_cache
63
# Directory names the working-tree walk never descends into.  The set mixes
# version-control storage with dependency trees, tool caches and build
# output.  It is a frozenset so that the membership test performed for every
# directory entry during the walk is a constant-time hash lookup.
_ALWAYS_PRUNE_DIRS: frozenset[str] = frozenset((
    # Version-control storage
    ".muse",
    ".git",
    # Dependency trees and virtual environments
    "node_modules",
    ".venv",
    "venv",
    # Interpreter and tool caches
    "__pycache__",
    ".tox",
    ".nox",
    ".mypy_cache",
    ".ruff_cache",
    ".pytest_cache",
    # Coverage and build output
    ".coverage",
    "htmlcov",
    "dist",
    "build",
))
87
# Patterns that are part of every ignore list, whether or not a .museignore
# file exists: load_ignore_patterns() places them ahead of all user-supplied
# patterns.
#
# ".env.example" is deliberately absent.  It is the conventional name for a
# credential-free environment template and has to stay trackable, so the real
# environment files are enumerated one by one instead of being matched with a
# wildcard that would also catch the template.
#
# The final two entries are OS-generated metadata files rather than secrets.
_BUILTIN_SECRET_PATTERNS: list[str] = [
    # Environment / direnv files
    ".env", ".env.local", ".env.development", ".env.staging",
    ".env.production", ".env.prod", ".envrc",
    # Key and certificate containers
    "*.pem", "*.key", "*.p12", "*.pfx",
    # OS metadata
    ".DS_Store", "Thumbs.db",
]
111
112 def _build_filename_filter(patterns: list[str]) -> re.Pattern[str] | None:
113 """Compile a combined regex for fast per-file ignore pre-rejection.
114
115 Translates every *simple* pattern (no ``/`` in the body) into a single
116 alternating regex so ``re.search(fname)`` can reject the overwhelming
117 majority of files in one call instead of N ``fnmatch`` calls.
118
119 Only patterns without ``/`` in their body are included — they test the
120 filename component. Patterns with an embedded ``/`` (e.g.
121 ``docs/*.md``) or a trailing ``/`` (directory patterns) must still go
122 through the full :func:`~muse.core.ignore.is_ignored` path.
123
124 Returns ``None`` when *patterns* is empty or contains no simple patterns.
125
126 Performance: at 75 000 files with 9 builtin patterns, replacing 9 × N
127 ``fnmatch`` calls with one ``re.search`` call reduces ignore-matching
128 overhead by ~10×, dropping warm ``walk_workdir`` time from ~850 ms to
129 ~85 ms at 75 k scale, making the 1-file-change target of < 200 ms
130 achievable.
131 """
132 translated: list[str] = []
133 for raw_pat in patterns:
134 body = raw_pat.lstrip("!") # strip negation marker
135 if body.endswith("/"):
136 body = body.rstrip("/")
137 if "/" in body:
138 continue # path-level pattern — needs full is_ignored evaluation
139 translated.append(fnmatch.translate(body))
140 if not translated:
141 return None
142 return re.compile(f"(?:{'|'.join(translated)})")
143
def load_ignore_patterns(workdir: pathlib.Path) -> list[str]:
    """Assemble the full ignore-pattern list that applies to *workdir*.

    The active domain is read from the repo JSON file located by
    ``_repo_json_path(workdir)``.  When that file does not exist, is not a
    JSON object, or has no string ``"domain"`` entry, the domain defaults to
    ``"code"``.  The ignore configuration loaded for *workdir* is then
    resolved against that domain.

    The returned list is a new list that starts with
    ``_BUILTIN_SECRET_PATTERNS`` followed by the resolved user patterns.

    NOTE(review): whether user patterns (e.g. a negation) can override the
    built-in entries depends on how ``is_ignored`` evaluates the list, which
    is not visible from this module — confirm.

    The function is public so that other commands can reuse the same
    domain-detection and pattern-merging logic.
    """
    active_domain = "code"
    repo_meta_path = _repo_json_path(workdir)
    if repo_meta_path.exists():
        repo_meta = load_json_file(repo_meta_path)
        declared = repo_meta.get("domain") if isinstance(repo_meta, dict) else None
        if isinstance(declared, str):
            active_domain = declared

    ignore_config = load_ignore_config(workdir)
    return _BUILTIN_SECRET_PATTERNS + resolve_patterns(ignore_config, active_domain)
166
# Field separator — a single NUL character — used when building the snapshot
# and commit identity payloads.
_SEP = "\0"
168
def build_snapshot_manifest(workdir: pathlib.Path) -> Manifest:
    """Build the ``{rel_path: object_id}`` manifest for *workdir*.

    Public alias for :func:`walk_workdir`: the argument is forwarded as-is
    and the resulting manifest is returned unchanged.
    """
    manifest = walk_workdir(workdir)
    return manifest
175
def directories_from_manifest(files: Manifest) -> list[str]:
    """Return every ancestor directory implied by the paths in *files*.

    Each key of *files* is treated as a ``/``-separated relative path; every
    proper prefix that ends just before a ``/`` is an ancestor directory.
    The result is the sorted, duplicate-free list of those prefixes.

    Only directories that have at least one file somewhere beneath them can
    be derived this way.  A directory with no files in the manifest leaves no
    trace here and can only be observed by a filesystem walk such as
    :func:`walk_workdir_with_dirs`.

    Intended for operations that build a manifest without walking the
    filesystem and still need a matching directory list.
    """
    ancestors: set[str] = set()
    for file_path in files:
        # Walk the separators left to right; the text before each one is an
        # ancestor directory of the file.
        cut = file_path.find("/")
        while cut != -1:
            ancestors.add(file_path[:cut])
            cut = file_path.find("/", cut + 1)
    return sorted(ancestors)
198
def walk_workdir_with_dirs(
    workdir: pathlib.Path,
) -> tuple[Manifest, list[str]]:
    """Walk *workdir* once and return ``(file_manifest, sorted_directories)``.

    The manifest maps the POSIX-style relative path of every tracked file to
    the value the stat cache's ``get_cached`` returns for it (presumably the
    file's object ID — the cache implementation lives in
    ``muse.core.stat_cache``).  The directory list holds the relative path of
    every directory below the root that the walk visits, sorted.

    What is left out:

    - Directories whose name is in :data:`_ALWAYS_PRUNE_DIRS`, and
      directories that contain a ``.muse`` sub-directory (nested repos), are
      pruned before the walk descends into them, so neither they nor their
      contents appear in either result.
    - Entries that cannot be ``lstat``-ed or that are not regular files.
      ``lstat`` does not follow symlinks, so symlinks are skipped as well.
    - Files matched by the ignore patterns, unless their relative path is in
      the force-track set.

    Side effects: once the walk is finished the stat cache is pruned to the
    paths in the manifest and saved.
    """
    patterns = load_ignore_patterns(workdir)
    forced = load_force_track_paths(workdir)
    stat_cache = load_cache(workdir)

    root = str(workdir)
    # Relative paths are produced by slicing "<root><separator>" off the front.
    rel_start = len(root) + 1
    needs_sep_fix = os.sep != "/"

    name_filter: re.Pattern[str] | None = _build_filename_filter(patterns)
    # A pattern containing "/" can match on the directory part of a path, so
    # the filename-only shortcut below cannot be trusted when one exists.
    any_path_pattern: bool = any("/" in pat.lstrip("!") for pat in patterns)

    files: Manifest = {}
    seen_dirs: list[str] = []

    for current, subdirs, names in os.walk(root, followlinks=False):
        # In-place assignment is what tells os.walk not to descend into the
        # removed entries: always-pruned names and nested muse repos.
        subdirs[:] = [
            sub for sub in subdirs
            if not (
                sub in _ALWAYS_PRUNE_DIRS
                or os.path.isdir(os.path.join(current, sub, ".muse"))
            )
        ]

        # Record every directory actually visited, except the root itself.
        if current != root:
            rel_dir = current[rel_start:]
            if needs_sep_fix:
                rel_dir = rel_dir.replace(os.sep, "/")
            seen_dirs.append(rel_dir)

        for name in names:
            full_path = os.path.join(current, name)
            try:
                info = os.lstat(full_path)
            except OSError:
                continue  # unreadable or vanished mid-walk — skip silently
            if not _stat.S_ISREG(info.st_mode):
                continue

            rel_path = full_path[rel_start:]
            if needs_sep_fix:
                rel_path = rel_path.replace(os.sep, "/")

            # Force-tracked paths bypass every ignore check.
            # NOTE(review): that includes the built-in blocklist patterns —
            # confirm this is intended.
            if rel_path not in forced:
                # Shortcut: when only filename-level patterns exist and the
                # combined regex does not match this name, no pattern can
                # match, so the full is_ignored() evaluation is skipped.
                fast_accept = (
                    name_filter is not None
                    and not name_filter.search(name)
                    and not any_path_pattern
                )
                if not fast_accept and is_ignored(rel_path, patterns):
                    continue

            files[rel_path] = stat_cache.get_cached(
                rel_path, full_path, info.st_mtime, info.st_size, info.st_ino
            )

    stat_cache.prune(set(files))
    stat_cache.save()
    seen_dirs.sort()
    return files, seen_dirs
274
def walk_workdir(workdir: pathlib.Path) -> Manifest:
    """Walk *workdir* and return only the ``{rel_path: object_id}`` manifest.

    Thin wrapper around :func:`walk_workdir_with_dirs` that drops the
    directory list.  Callers that need both results should call
    :func:`walk_workdir_with_dirs` directly rather than walking twice.

    All exclusion rules are those of :func:`walk_workdir_with_dirs` and are
    applied silently:

    - Symlinks and other non-regular files are never included.
    - Paths matched by the ignore patterns (``.museignore`` plus the built-in
      blocklist) are skipped.
    - Directories named in ``_ALWAYS_PRUNE_DIRS`` and nested muse repos are
      not descended into.

    Dotfiles and dot-directories receive no special treatment: they are
    tracked unless one of the rules above excludes them.

    Keys use ``/`` as the separator on every host OS.
    """
    return walk_workdir_with_dirs(workdir)[0]
315
def snapshot_identity_bytes(
    manifest: Manifest,
    directories: list[str] | None = None,
) -> bytes:
    """Build the canonical byte payload that identifies a snapshot.

    This is the payload :func:`compute_snapshot_id` hashes (after prepending
    its typed header).

    Layout, with every piece joined by a NUL character and the whole string
    encoded with ``str.encode()``:

    1. One ``<path> NUL <id>`` entry per manifest item, where ``<id>`` is the
       second element of ``split_id(object_id)``.  Entries are sorted so the
       result does not depend on dict order.
    2. If *directories* is non-empty, one ``dir NUL <directory>`` entry per
       directory, in sorted order, appended after the file entries.

    Passing ``None`` or an empty list for *directories* yields the same bytes
    as a files-only snapshot.

    NOTE(review): a file whose path is literally ``dir`` serialises as
    ``dir NUL <id>``, which has the same shape as a directory entry.  The two
    namespaces stay distinct only while no directory name equals such an id
    string — confirm this is acceptable.
    """
    entries = [
        f"{path}{_SEP}{split_id(oid)[1]}" for path, oid in manifest.items()
    ]
    entries.sort()
    if directories:
        entries += [f"dir{_SEP}{name}" for name in sorted(directories)]
    return _SEP.join(entries).encode()
339
340
def compute_snapshot_id(
    manifest: Manifest,
    directories: list[str] | None = None,
) -> str:
    """Return the ``sha256:<hex>`` ID of a snapshot.

    The digest covers a typed header followed by the canonical payload from
    :func:`snapshot_identity_bytes`.  The header is the text ``snapshot``, a
    space, the payload length in bytes as a decimal number, and one NUL byte.
    """
    payload = snapshot_identity_bytes(manifest, directories)
    digest = _hashlib.sha256(f"snapshot {len(payload)}\0".encode())
    digest.update(payload)
    return f"sha256:{digest.hexdigest()}"
353
def _files_under_directory(manifest: Manifest, directory: str) -> Manifest:
    """Return ``{path relative to directory: object_id}`` for files below *directory*."""
    prefix = f"{directory}/"
    offset = len(prefix)
    return {
        path[offset:]: oid
        for path, oid in manifest.items()
        if path.startswith(prefix)
    }


def detect_directory_renames(
    deleted_dirs: set[str],
    added_dirs: set[str],
    last_manifest: Manifest,
    current_manifest: Manifest,
) -> list[tuple[str, str]]:
    """Return ``[(old_dir, new_dir)]`` pairs inferred from two manifests.

    A rename is inferred when the files below *old_dir* in *last_manifest*
    and the files below *new_dir* in *current_manifest* have exactly the same
    relative paths and the same object IDs.

    Args:
        deleted_dirs: Directories present before and gone now.
        added_dirs: Directories that are new in the current tree.
        last_manifest: ``{path: object_id}`` of the previous snapshot.
        current_manifest: ``{path: object_id}`` of the current tree.

    Returns:
        Rename pairs ordered by *old_dir*.  Each added directory appears in
        at most one pair.

    The heuristic is conservative:

    - A deleted directory with no files in *last_manifest* is skipped, since
      there is no content to match on.
    - If more than one still-unclaimed added directory has the same file set
      as a deleted directory, the match is ambiguous and no rename is
      reported for that deleted directory.
    """
    # File sets of the added directories are computed once up front, not
    # once per (old, new) pair.
    added_contents = [
        (new_dir, _files_under_directory(current_manifest, new_dir))
        for new_dir in sorted(added_dirs)
    ]

    renames: list[tuple[str, str]] = []
    claimed: set[str] = set()

    for old_dir in sorted(deleted_dirs):
        old_files = _files_under_directory(last_manifest, old_dir)
        if not old_files:
            continue  # empty dir — can't match by content

        candidates = [
            new_dir
            for new_dir, new_files in added_contents
            if new_dir not in claimed and new_files == old_files
        ]
        if len(candidates) != 1:
            continue  # no match, or several identical targets (ambiguous)

        target = candidates[0]
        renames.append((old_dir, target))
        claimed.add(target)

    return renames
402
def diff_workdir_vs_snapshot(
    workdir: pathlib.Path,
    last_manifest: Manifest,
    last_directories: list[str] | None = None,
) -> tuple[set[str], set[str], set[str], set[str], set[str], set[str]]:
    """Compare *workdir* with the manifest of the previous commit.

    Args:
        workdir: Root of the working tree.
        last_manifest: ``{path: object_id}`` of the previous snapshot.
        last_directories: Directory list of the previous snapshot; ``None``
            is treated as empty.

    Returns:
        ``(added, modified, deleted, untracked, added_dirs, deleted_dirs)``:

        - ``added`` — files in *workdir* that are not in *last_manifest*.
        - ``modified`` — files in both whose object IDs differ.
        - ``deleted`` — files in *last_manifest* missing from the walk,
          except files that still exist on disk and are now ignored.
        - ``untracked`` — non-empty only when *last_manifest* is empty, in
          which case it holds every file found and the first three sets are
          empty.
        - ``added_dirs`` — directories found in *workdir* that are not in
          *last_directories*.
        - ``deleted_dirs`` — directories in *last_directories* that were not
          found in *workdir*.

        When *workdir* does not exist, everything from the previous snapshot
        is reported as deleted.

    The directory sets are computed against *last_directories* on every
    path, including the empty-manifest case.

    All paths use POSIX separators.
    """
    last_paths = set(last_manifest)
    last_dirs = set(last_directories or ())

    if not workdir.exists():
        return set(), set(), last_paths, set(), set(), last_dirs

    current_manifest, current_dirs = walk_workdir_with_dirs(workdir)
    current_paths = set(current_manifest)

    # Directory diff is independent of the file diff, so compute it once and
    # use it on both return paths below.
    current_dir_set = set(current_dirs)
    added_dirs = current_dir_set - last_dirs
    deleted_dirs = last_dirs - current_dir_set

    if not last_paths:
        # Nothing was tracked before: every file found is untracked.
        return set(), set(), set(), current_paths, added_dirs, deleted_dirs

    added = current_paths - last_paths
    deleted = last_paths - current_paths
    modified = {
        path
        for path in current_paths & last_paths
        if current_manifest[path] != last_manifest[path]
    }

    # A previously tracked file that is now matched by the ignore patterns
    # and still exists on disk has been taken out of tracking on purpose; it
    # is not "deleted".  Only files really missing from the tree are kept.
    if deleted:
        ignore_patterns = load_ignore_patterns(workdir)
        deleted = {
            path
            for path in deleted
            if not (is_ignored(path, ignore_patterns) and (workdir / path).exists())
        }

    return added, modified, deleted, set(), added_dirs, deleted_dirs
460
def commit_identity_bytes(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    committed_at_iso: str,
    author: str = "",
    signer_public_key: str = "",
) -> bytes:
    """Build the canonical byte payload that identifies a commit.

    This is the payload :func:`compute_commit_id` hashes (after prepending
    its typed header).

    Six fields are joined with a NUL character, in this order, and the
    result is encoded with ``str.encode()``:

    1. parents — the second element of ``split_id(p)`` for each parent,
       sorted and themselves NUL-joined, so the order of *parent_ids* does
       not matter;
    2. snapshot — the second element of ``split_id(snapshot_id)``, or the
       empty string when *snapshot_id* is empty;
    3. *message*;
    4. *committed_at_iso*;
    5. *author*;
    6. *signer_public_key*.

    NOTE(review): the parent list uses the same NUL separator as the outer
    fields, and *message* is not checked for NUL characters — confirm that
    callers never pass text containing NUL.
    """
    parent_field = _SEP.join(sorted(split_id(parent)[1] for parent in parent_ids))
    snapshot_field = split_id(snapshot_id)[1] if snapshot_id else ""
    fields = (
        parent_field,
        snapshot_field,
        message,
        committed_at_iso,
        author,
        signer_public_key,
    )
    return _SEP.join(fields).encode()
491
492
def compute_commit_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    committed_at_iso: str,
    author: str = "",
    signer_public_key: str = "",
) -> str:
    """Return the ``sha256:<hex>`` ID of a commit.

    The digest covers a typed header followed by the canonical payload from
    :func:`commit_identity_bytes`.  The header is the text ``commit``, a
    space, the payload length in bytes as a decimal number, and one NUL byte.

    Because *author* and *signer_public_key* are part of the payload, a
    change to either one produces a different commit ID.
    """
    payload = commit_identity_bytes(
        parent_ids,
        snapshot_id,
        message,
        committed_at_iso,
        author,
        signer_public_key,
    )
    hasher = _hashlib.sha256(f"commit {len(payload)}\0".encode())
    hasher.update(payload)
    return "sha256:" + hasher.hexdigest()
File History 1 commit
sha256:84df9126d09aeec0b8f1b908f0b06c10913feec28f3514b382efb1ba6d619385 refactor: rename StructuredMergePlugin to AddressedMergePlu… Sonnet 4.6 minor 25 days ago