gabriel / muse public
repo.py python
447 lines 17.8 KB
Raw
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 20 days ago
1 """Repository detection utilities for the Muse CLI.
2
3 Walking up the directory tree to locate a ``.muse/`` directory is the
4 single most-called internal primitive. Every subcommand uses it. Keeping
5 the semantics clear (``None`` on miss, never raises) makes callers simpler
6 and test isolation easier (``MUSE_REPO_ROOT`` env-var override).
7
8 :func:`read_repo_id` is the canonical way to read the repository ID from
9 ``.muse/repo.json``. It replaces 73 copy-pasted ``_read_repo_id`` functions
10 that had diverged into four different error-handling behavioural variants.
11 The canonical implementation uses ``REPO_NOT_FOUND`` on a missing file and
12 ``INTERNAL_ERROR`` on a malformed file — errors go through the logger, not
13 stderr, so commands that want to emit a user-facing message can catch the
14 ``SystemExit`` and print their own context.
15
:func:`require_repo` performs a **startup GC sweep** on every invocation,
removing stale temp files left by a prior SIGKILL.  Every temp-file
family used by Muse is covered:

* ``.obj-tmp-*`` / ``.restore-tmp-*`` — object-store shard directories
  (delegated to :func:`muse.core.object_store.cleanup_stale_object_temps`)
* ``.muse-tmp-*`` — created by :func:`~muse.core.store.write_text_atomic`
  and :func:`~muse.core.store._write_msgpack_atomic` in ``.muse/`` and its
  subdirectories (branches, tags, releases, …)
* ``.stat_cache_*.tmp`` — created by :class:`~muse.core.stat_cache.StatCache`
  in ``.muse/``
* ``.symbols_*.tmp``, ``.callgraph_*.tmp``, ``.implicit_edges_*.tmp``,
  ``.invariants_*.tmp`` — created by the symbol, call-graph, implicit-edge
  and invariant caches (see ``_MUSE_TEMP_PREFIXES`` below)
27 """
28
29 import datetime
30 import logging
31 import os
32 import pathlib
33 import sys
34
35 from muse.core.paths import muse_dir as _muse_dir, repo_json_path as _repo_json_path
36 from muse.core.types import load_json_file
37 from muse.core.errors import ExitCode, UntrustedRepositoryError
38 from muse.core.validation import assert_not_symlink
39
logger = logging.getLogger(__name__)

# Directories that the startup sweep lists when looking for stale temp files.
# Every entry is relative to ``.muse/``; the empty string means ``.muse/``
# itself.  ``objects/`` is intentionally missing because
# cleanup_stale_object_temps takes care of it.
_MUSE_SWEEP_DIRS: tuple[str, ...] = (
    # .muse/ root itself (HEAD, stat_cache, config, merge-state, etc.)
    "",
    # write_branch_ref
    "branches",
    # legacy — kept for stale-temp sweep on pre-migration repos
    "commits",
    # legacy — kept for stale-temp sweep on pre-migration repos
    "snapshots",
    # write_tag via write_json_atomic
    "tags",
    # write_release via write_json_atomic
    "releases",
    # write_branch_ref alternative path
    "refs/heads",
    # code-domain index writes
    "code",
    # index writes
    "indices",
    # create_intent / create_reservation
    "coordination",
    # _save_meta
    "worktrees",
    # shelf writes
    "shelf",
    # recomputable JSON caches (symbol, callgraph, stat, etc.)
    "cache",
    # log writes
    "logs",
    # remote config writes
    "remotes",
)
61
# A file inside one of the swept directories is treated as a stale temp when
# its name begins with any prefix listed here.  Writers per prefix:
#   .muse-tmp-        write_text_atomic (tags, releases, branches, refs)
#   .stat_cache_      StatCache.save            (mkstemp, suffix .tmp)
#   .symbols_         SymbolCache.save          (mkstemp, suffix .tmp)
#   .callgraph_       CallGraphCache.save       (mkstemp, suffix .tmp)
#   .implicit_edges_  ImplicitEdgeCache.save    (mkstemp, suffix .tmp)
#   .invariants_      _InvariantFileCache.save  (mkstemp, suffix .tmp)
_MUSE_TEMP_PREFIXES: tuple[str, ...] = (
    ".muse-tmp-", ".stat_cache_", ".symbols_",
    ".callgraph_", ".implicit_edges_", ".invariants_",
)
77
def _cleanup_muse_dir_temps(muse_dir: pathlib.Path) -> int:
    """Remove stale temp files left inside ``.muse/`` by crashed writers.

    A regular file counts as a stale temp when its name starts with one of
    the prefixes in :data:`_MUSE_TEMP_PREFIXES`.  Only the directories named
    in :data:`_MUSE_SWEEP_DIRS` are listed, and each is listed
    non-recursively, so nothing outside that set is touched.

    The sweep is best-effort: a directory that cannot be listed, or a file
    that cannot be removed, is logged as a warning and skipped.  An
    ``OSError`` from listing or unlinking never propagates to the caller.

    Args:
        muse_dir: Path to the ``.muse/`` directory.

    Returns:
        Number of stale temp files removed.
    """
    if not muse_dir.is_dir():
        return 0
    removed = 0
    for subdir in _MUSE_SWEEP_DIRS:
        target = muse_dir / subdir if subdir else muse_dir
        # Skip symlinked subdirectories — never delete files inside an
        # attacker-controlled location that was swapped in via symlink.
        if not target.is_dir() or target.is_symlink():
            continue
        # Materialise the listing inside the try: the directory can vanish or
        # be unreadable between the is_dir() check and the listing, and a
        # housekeeping sweep must not abort the command that triggered it.
        try:
            entries = list(target.iterdir())
        except OSError as exc:
            logger.warning(
                "⚠️ Could not list %s while sweeping stale temps: %s", target, exc
            )
            continue
        for entry in entries:
            # Cheap name test first so only candidate temps are stat'ed.
            if not entry.name.startswith(_MUSE_TEMP_PREFIXES):
                continue
            if not entry.is_file():
                continue
            try:
                entry.unlink()
            except OSError as exc:
                logger.warning(
                    "⚠️ Could not remove stale temp %s: %s", entry, exc
                )
            else:
                removed += 1
                logger.warning(
                    "⚠️ Removed stale muse temp %s (left by prior crash)", entry
                )
    return removed
116
# Subdirectories of ``.muse/`` (relative paths) that
# _verify_muse_dir_integrity refuses to accept as symbolic links.
_CRITICAL_MUSE_DIRS: tuple[str, ...] = ("objects", "refs", "refs/heads", "tags")
123
def _verify_muse_dir_integrity(muse_dir: pathlib.Path) -> None:
    """Assert that critical ``.muse/`` subdirectories are not symbolic links.

    Each name in :data:`_CRITICAL_MUSE_DIRS` is checked with
    :func:`assert_not_symlink`.  A name that is entirely absent is skipped
    (the directory has simply not been created yet).  A symlink is always
    checked, including a *dangling* one whose target does not exist.

    Args:
        muse_dir: Path to the ``.muse/`` directory.

    Raises:
        SystemExit(1): If :func:`assert_not_symlink` rejects a critical
            subdirectory (it signals this with ``ValueError``); the message
            is logged at error level first.
    """
    for subname in _CRITICAL_MUSE_DIRS:
        candidate = muse_dir / subname
        # Path.exists() follows symlinks and therefore reports False for a
        # dangling symlink.  Test is_symlink() first so that a broken link is
        # not mistaken for "not yet created" and silently skipped.
        if not candidate.is_symlink() and not candidate.exists():
            continue  # not yet created — first-use, not an attack
        try:
            assert_not_symlink(candidate, label=f".muse/{subname}")
        except ValueError as exc:
            logger.error("❌ %s", exc)
            raise SystemExit(1) from exc
146
def _startup_gc(repo_root: pathlib.Path) -> None:
    """Run both stale-temp sweeps for the repository at *repo_root*.

    Two sweeps are performed, in this order:

    1. :func:`~muse.core.object_store.cleanup_stale_object_temps`, called
       with the repository root (object-store temps).
    2. :func:`_cleanup_muse_dir_temps`, called with the ``.muse/`` directory
       (every prefix listed in :data:`_MUSE_TEMP_PREFIXES`).

    The object-store helper is imported inside the function rather than at
    module import time.

    Args:
        repo_root: The repository root directory (parent of ``.muse/``).
    """
    from muse.core.object_store import (
        cleanup_stale_object_temps as _sweep_object_temps,
    )

    _sweep_object_temps(repo_root)
    store_dir = _muse_dir(repo_root)
    _cleanup_muse_dir_temps(store_dir)
168
169 def _resolve_worktree_pointer(pointer_path: pathlib.Path) -> pathlib.Path | None:
170 """Read a ``.muse`` worktree pointer file and return the main repo root.
171
172 The file must contain a line of the form::
173
174 musestore: /absolute/path/to/main/.muse
175
176 Returns the parent of the ``.muse/`` store (i.e. the main repo root), or
177 ``None`` on any parse or validation failure. Never raises.
178 """
179 try:
180 text = pointer_path.read_text(encoding="utf-8", errors="strict").strip()
181 except Exception as exc:
182 logger.debug("Could not read worktree pointer %s: %s", pointer_path, exc)
183 return None
184
185 prefix = "musestore: "
186 if not text.startswith(prefix):
187 logger.debug("Worktree pointer %s has unexpected format: %r", pointer_path, text[:80])
188 return None
189
190 raw_store = text[len(prefix):].strip()
191 if not raw_store:
192 return None
193 if any(ord(c) < 0x20 or ord(c) == 0x7F for c in raw_store):
194 logger.warning("⚠️ Worktree pointer %s contains control characters — ignoring", pointer_path)
195 return None
196 if len(raw_store) > 4096:
197 logger.warning("⚠️ Worktree pointer %s path too long — ignoring", pointer_path)
198 return None
199
200 store_path = pathlib.Path(raw_store).resolve()
201 if not store_path.is_dir() or store_path.is_symlink():
202 logger.debug("Worktree pointer %s → %s is not a valid store dir", pointer_path, store_path)
203 return None
204
205 repo_root = store_path.parent
206 # Loop guard: resolved root must not be the worktree directory itself.
207 worktree_dir = pointer_path.parent.resolve()
208 if repo_root == worktree_dir:
209 logger.warning("⚠️ Worktree pointer %s loops back to its own directory", pointer_path)
210 return None
211
212 return repo_root
213
def _is_repo_trusted(repo_root: pathlib.Path) -> bool:
    """Tell whether *repo_root* appears in the caller's trust list.

    Decision order:

    1. uid 0 is trusted unconditionally.
    2. ``MUSE_SAFE_DIRS`` — colon-separated paths; blank items are skipped
       and each remaining item is resolved before comparison.
    3. The paths returned by ``muse.cli.config.get_global_safe_dirs()``.
       Any exception raised while importing or consulting that source is
       swallowed and treated as "not trusted".

    Comparison is always between fully resolved paths.

    Args:
        repo_root: The repository root directory to look up.

    Returns:
        ``True`` when the repo is trusted, ``False`` otherwise.
    """
    if os.getuid() == 0:
        return True

    wanted = repo_root.resolve()

    # 1. MUSE_SAFE_DIRS env var (colon-separated, for Docker/CI).
    env_items = (
        chunk.strip() for chunk in os.environ.get("MUSE_SAFE_DIRS", "").split(":")
    )
    if any(item and pathlib.Path(item).resolve() == wanted for item in env_items):
        return True

    # 2. ~/.muse/config.toml [security] safe_dirs.
    try:
        from muse.cli.config import get_global_safe_dirs

        return any(
            pathlib.Path(listed).resolve() == wanted
            for listed in get_global_safe_dirs()
        )
    except Exception:  # noqa: BLE001
        return False
245
def _check_repo_ownership(repo_root: pathlib.Path) -> None:
    """Refuse a repository whose ``.muse/`` belongs to a different user.

    This is the Muse counterpart of the CVE-2022-24765 mitigation: the
    ``.muse/`` directory must be owned by the calling user unless the
    repository has been explicitly trusted.

    The check passes silently when any of the following holds:

    * the current uid is 0,
    * ``.muse/`` cannot be stat'ed (``OSError``) — a missing store is dealt
      with elsewhere,
    * the owner uid of ``.muse/`` equals the current uid,
    * :func:`_is_repo_trusted` accepts *repo_root*.

    Args:
        repo_root: The repository root directory (parent of ``.muse/``).

    Raises:
        UntrustedRepositoryError: When the owner uid differs from the current
            uid and the repository is not in the trust list.
    """
    me = os.getuid()
    if me == 0:
        return  # root bypass

    store = _muse_dir(repo_root)
    try:
        info = store.stat()
    except OSError:
        # Nothing to compare against; not this function's problem.
        return

    owner = info.st_uid
    # The trust list is consulted only when the owner really differs.
    if owner != me and not _is_repo_trusted(repo_root):
        raise UntrustedRepositoryError(
            path=str(repo_root),
            owner_uid=owner,
            current_uid=me,
        )
288
def find_repo_root(start: pathlib.Path | None = None) -> pathlib.Path | None:
    """Locate the repository root for *start* (default ``Path.cwd()``).

    **Environment override.**  When ``MUSE_REPO_ROOT`` is set to a non-blank
    value, no directory walk happens at all:

    - a value containing control characters yields ``None`` (with a warning);
    - a value longer than 4096 characters yields ``None`` (with a warning);
    - otherwise the value is resolved and returned if it contains a real,
      non-symlinked ``.muse/`` directory, else ``None``.

    An empty or whitespace-only value is ignored and the walk below runs.

    **Directory walk.**  Starting at the resolved *start*, each directory up
    to the filesystem root is examined:

    - a real (non-symlink) ``.muse/`` directory makes that directory the root;
    - a real (non-symlink) ``.muse`` *file* is treated as a linked-worktree
      pointer; if :func:`_resolve_worktree_pointer` accepts it, the main
      repository root it names is returned, otherwise the walk continues;
    - a symlinked ``.muse`` is skipped.

    **Ownership check.**  Before any root is returned it is passed to
    :func:`_check_repo_ownership`.

    Args:
        start: Directory to start the walk from; ``None`` means the current
            working directory.

    Returns:
        The repository root, or ``None`` when none is found.

    Raises:
        UntrustedRepositoryError: Propagated from
            :func:`_check_repo_ownership` when the located repository is
            owned by another user and is not trusted.
    """
    override = os.environ.get("MUSE_REPO_ROOT")
    if override is not None:
        env_path = override.strip()
        if env_path:
            # Reject values containing control characters.
            if any(ch < " " or ch == "\x7f" for ch in env_path):
                logger.warning(
                    "⚠️ MUSE_REPO_ROOT contains control characters — ignoring for safety"
                )
                return None
            # Reject unreasonably long paths (OS PATH_MAX is 4096 on Linux/macOS).
            if len(env_path) > 4096:
                logger.warning(
                    "⚠️ MUSE_REPO_ROOT is too long (%d chars) — ignoring for safety",
                    len(env_path),
                )
                return None
            forced_root = pathlib.Path(env_path).resolve()
            logger.debug("⚠️ MUSE_REPO_ROOT override active: %s", forced_root)
            forced_store = _muse_dir(forced_root)
            # A symlinked .muse/ is refused for the override as well.
            if not forced_store.is_dir() or forced_store.is_symlink():
                return None
            _check_repo_ownership(forced_root)
            return forced_root
        # Blank override: note it and fall through to the walk.
        logger.debug("MUSE_REPO_ROOT is empty or whitespace — ignoring, using cwd walk")

    origin = (start or pathlib.Path.cwd()).resolve()
    for directory in (origin, *origin.parents):
        marker = _muse_dir(directory)
        # A symlink here would redirect all subsequent writes to a location
        # outside the repo root, so it never qualifies in either form.
        if marker.is_symlink():
            continue
        if marker.is_dir():
            _check_repo_ownership(directory)
            return directory
        # Linked worktree: .muse is a file containing "musestore: /path/to/.muse"
        if marker.is_file():
            main_root = _resolve_worktree_pointer(marker)
            if main_root is not None:
                _check_repo_ownership(main_root)
                return main_root
    return None
362
# Two-line message that require_repo prints to stderr when no repository
# root can be found.
_NOT_A_REPO_MSG = "\n".join(
    (
        "fatal: not a muse repository (or any parent up to mount point /)",
        'Run "muse init" to initialize a new repository.',
    )
)
367
def require_repo(start: pathlib.Path | None = None) -> pathlib.Path:
    """Return the repository root, or exit when there is none.

    Thin wrapper around :func:`find_repo_root` for commands that cannot run
    outside a Muse repository.

    When a root is found, two steps run before it is returned:

    1. :func:`_verify_muse_dir_integrity` checks the ``.muse/`` directory.
    2. :func:`_startup_gc` sweeps stale temp files.

    Args:
        start: Directory to start the search from; forwarded unchanged to
            :func:`find_repo_root`.

    Returns:
        The repository root directory.

    Raises:
        SystemExit(ExitCode.REPO_NOT_FOUND): When no repository is found;
            :data:`_NOT_A_REPO_MSG` is printed to stderr first.
    """
    repo_root = find_repo_root(start)
    if repo_root is not None:
        _verify_muse_dir_integrity(_muse_dir(repo_root))
        _startup_gc(repo_root)
        return repo_root
    print(_NOT_A_REPO_MSG, file=sys.stderr)
    raise SystemExit(ExitCode.REPO_NOT_FOUND)
389
#: Public alias of :func:`require_repo`; both names are bound to the same
#: function object, so they behave identically.
require_repo_root = require_repo
392
def parse_date_arg(value: str, flag: str) -> datetime.datetime:
    """Parse an ISO-8601 date or datetime string from a CLI flag.

    Accepted inputs:

    * ``YYYY-MM-DD`` and ``YYYY-MM-DDTHH:MM:SS`` — interpreted as UTC.
    * Any other form understood by
      :meth:`datetime.datetime.fromisoformat`, for example fractional
      seconds or an explicit UTC offset (``2024-01-02T03:04:05+02:00``).
      A trailing ``Z`` is accepted as a synonym for ``+00:00``.  Values that
      carry an offset are converted to UTC; values without one are
      interpreted as UTC.

    Args:
        value: The raw string value from the CLI argument.
        flag: The flag name (e.g. ``--since``) used in the error message.

    Returns:
        A UTC-aware :class:`datetime.datetime`.

    Raises:
        SystemExit(1): when *value* cannot be parsed; a message naming
            *flag* is printed to stderr first.
    """
    utc = datetime.timezone.utc

    # The two historically supported formats are tried first so that every
    # input accepted before is parsed exactly as before.
    for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
        try:
            return datetime.datetime.strptime(value, fmt).replace(tzinfo=utc)
        except ValueError:
            continue

    # Fallback for offset-bearing / fractional ISO-8601 forms.  "Z" is
    # rewritten by hand because fromisoformat only learned it in Python 3.11.
    iso_text = value[:-1] + "+00:00" if value.endswith("Z") else value
    try:
        parsed = datetime.datetime.fromisoformat(iso_text)
    except ValueError:
        parsed = None

    if parsed is not None:
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=utc)
        return parsed.astimezone(utc)

    print(
        f"❌ Invalid date for {flag}: {value!r}"
        " — expected YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS",
        file=sys.stderr,
    )
    raise SystemExit(1)
426
def read_repo_id(repo_root: pathlib.Path) -> str:
    """Return the ``repo_id`` stored in ``.muse/repo.json``.

    The file location comes from ``repo_json_path(repo_root)`` and its
    content is loaded with ``load_json_file``.  Failures are reported through
    the module logger at debug level and signalled with ``SystemExit``, so
    callers can catch it and print their own message.

    Args:
        repo_root: The repository root directory.

    Returns:
        The repository ID as a ``str``.

    Raises:
        SystemExit(REPO_NOT_FOUND): when ``.muse/repo.json`` does not exist.
        SystemExit(INTERNAL_ERROR): when the loader returns ``None`` or the
            loaded data has no string ``repo_id`` entry.
    """
    json_path = _repo_json_path(repo_root)
    if not json_path.exists():
        logger.debug(".muse/repo.json not found")
        raise SystemExit(ExitCode.REPO_NOT_FOUND)

    payload = load_json_file(json_path)
    repo_id = None if payload is None else payload.get("repo_id")
    if not isinstance(repo_id, str):
        logger.debug(".muse/repo.json malformed")
        raise SystemExit(ExitCode.INTERNAL_ERROR)
    return str(repo_id)
File History 4 commits
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 20 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 22 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 28 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 29 days ago