gabriel / muse public
content_grep.py python
592 lines 20.8 KB
Raw
sha256:18b983389ee1b55900fcd799bfbb496552d2e3ecded9d18cefbfef188947a12e chore: remove blob-debug test marker file Sonnet 4.6 23 hours ago
1 """``muse content-grep`` — full-text search across tracked files.
2
3 Searches the content of every tracked file for a pattern. By default the
4 search runs against the committed HEAD snapshot (reading from the object
5 store). Pass ``--working-tree`` to search the actual files on disk,
6 including uncommitted edits — essential for agents verifying their own
7 changes before committing.
8
9 ``--working-tree`` and ``--ref`` are mutually exclusive.
10
11 Binary files and non-UTF-8 files are silently skipped. Regex safety:
12 patterns are compiled with a 500-character length limit to prevent
13 catastrophic backtracking (ReDoS).
14
15 Performance (snapshot mode): object reads run in parallel using a bounded
16 ``ThreadPoolExecutor`` (``min(8, cpu_count())`` workers).
17
18 File filtering: ``--include`` and ``--exclude`` accept ``fnmatch``-style
19 glob patterns applied to relative file paths.
20
21 Usage::
22
23 muse content-grep "Cm7" # literal substring (HEAD)
24 muse content-grep "TODO" --working-tree # search working tree (disk)
25 muse content-grep "tempo:\\s+\\d+" # regex
26 muse content-grep "TODO" --ignore-case # case-insensitive
27 muse content-grep "chorus" --files-only # only file paths
28 muse content-grep "bass" --ref feat/audio # search a branch tip
29 muse content-grep "note" --include "*.txt" # only .txt files
30 muse content-grep "debug" --exclude "*.min.js"
31 muse content-grep "TODO" --max-matches 20 # cap results
32 muse content-grep "verse" --context 2 # 2 lines of context
33 muse content-grep "chord" --json # machine-readable
34
35 JSON output schema (``--json``)::
36
37 {
38 "source": "commit" | "working-tree",
39 "commit_id": "sha256:<64 hex>" | null,
40 "snapshot_id": "sha256:<64 hex>" | null,
41 "pattern": "<pattern string>",
42 "total_files_matched": <int>,
43 "total_matches": <int>,
44 "results": [
45 {
46 "path": "<relative path>",
47 "object_id": "sha256:<64 hex>" | null,
48 "match_count": <int>,
49 "matches": [
50 {
51 "line_number": <int>,
52 "line": "<matched line>",
53 "context_before": ["<line>", ...],
54 "context_after": ["<line>", ...]
55 },
56 ...
57 ]
58 },
59 ...
60 ],
61 "duration_ms": 0.001234,
62 "exit_code": 0
63 }
64
65 ``duration_ms``
66 Wall-clock time from argument parsing to output.
67 ``exit_code``
68 Mirrors the process exit code: ``0`` when matches found; ``1`` when no
69 matches.
70
71 Exit codes::
72
73 0 — pattern found in at least one file
74 1 — no matches (or no commits)
75 3 — I/O error
76 """
77
78 import argparse
79 import concurrent.futures
80 import fnmatch
81 import json
82 import logging
83 import os
84 import pathlib
85 import re
86 import sys
87 from collections.abc import Callable
88 from typing import TypedDict
89
90 from muse.core.errors import ExitCode
91 from muse.core.object_store import read_object
92 from muse.core.repo import require_repo
93 from muse.core.refs import (
94 get_head_commit_id,
95 read_current_branch,
96 )
97 from muse.core.commits import (
98 read_commit,
99 resolve_commit_ref,
100 )
101 from muse.core.snapshots import read_snapshot
102 from muse.core.validation import sanitize_display
103 from muse.core.envelope import EnvelopeJson, make_envelope
104 from muse.core.timing import start_timer
105
106 logger = logging.getLogger(__name__)
107
108 _BINARY_CHUNK = 8192
109 _MAX_PATTERN_LEN = 500 # reject patterns that could cause catastrophic backtracking
110 _DEFAULT_MAX_WORKERS = min(8, (os.cpu_count() or 1))
111
112 # Directories to skip when walking the working tree.
113 _SKIP_DIRS: frozenset[str] = frozenset({
114 ".muse",
115 ".git",
116 "__pycache__",
117 ".mypy_cache",
118 ".pytest_cache",
119 ".tox",
120 "node_modules",
121 ".venv",
122 "venv",
123 ".env",
124 })
125
126 # ---------------------------------------------------------------------------
127 # TypedDicts for structured output
128 # ---------------------------------------------------------------------------
129
class GrepMatch(TypedDict):
    """A single matching line within a file, with optional surrounding context.

    Instances are built by ``_search_lines``; both context lists are empty
    unless a positive context width was requested.
    """

    line_number: int  # 1-based line number of the matching line
    line: str  # text of the matching line, without its line terminator
    context_before: list[str]  # lines immediately preceding the match, in file order
    context_after: list[str]  # lines immediately following the match, in file order
137
class GrepFileResult(TypedDict):
    """All matches within a single file.

    One entry is produced per file that has at least one matching line.
    """

    path: str  # relative path of the file (manifest key or path relative to the repo root)
    object_id: str | None  # None when source is working-tree
    match_count: int  # number of matching lines reported for this file
    matches: list[GrepMatch]  # per-line details; left empty in files-only and count modes
145
class _ContentGrepJson(EnvelopeJson):
    """Top-level JSON output for ``muse content-grep --json``.

    Extends ``EnvelopeJson`` (from ``muse.core.envelope``), whose fields are
    supplied by ``make_envelope`` when the payload is built.
    NOTE(review): presumably those are the ``duration_ms`` / ``exit_code``
    keys shown in the module docstring — confirm against ``EnvelopeJson``.
    """

    source: str  # "commit" | "working-tree"
    commit_id: str | None  # None in working-tree mode
    snapshot_id: str | None  # None in working-tree mode
    pattern: str  # pattern string that was searched for
    total_files_matched: int  # number of entries in ``results``
    total_matches: int  # sum of ``match_count`` over ``results``
    results: list[GrepFileResult]  # one entry per file with at least one match
156
157 # ---------------------------------------------------------------------------
158 # Internal helpers
159 # ---------------------------------------------------------------------------
160
161 def _is_binary(data: bytes) -> bool:
162 """Return ``True`` if *data* (the first chunk) contains null bytes."""
163 return b"\x00" in data
164
165 def _path_matches_globs(rel_path: str, include: str | None, exclude: str | None) -> bool:
166 """Return ``True`` if *rel_path* passes the include/exclude glob filters.
167
168 ``--include`` and ``--exclude`` use ``fnmatch`` on the basename **and** on
169 the full relative path so that patterns like ``*.py`` and ``src/*.py`` both
170 work intuitively.
171 """
172 basename = pathlib.PurePosixPath(rel_path).name
173 if include is not None:
174 if not (fnmatch.fnmatch(basename, include) or fnmatch.fnmatch(rel_path, include)):
175 return False
176 if exclude is not None:
177 if fnmatch.fnmatch(basename, exclude) or fnmatch.fnmatch(rel_path, exclude):
178 return False
179 return True
180
def _search_lines(
    raw: bytes,
    pattern: re.Pattern[str],
    files_only: bool,
    count_only: bool,
    context_lines: int,
) -> tuple[int, list[GrepMatch]]:
    """Search *raw* bytes for *pattern*; return ``(match_count, matches)``.

    Binary content (a NUL byte within the first ``_BINARY_CHUNK`` bytes) and
    content that is not valid UTF-8 are skipped and yield ``(0, [])``.

    Args:
        raw: Complete file content.
        pattern: Compiled pattern, applied to each line with ``search``.
        files_only: When true, only the count is produced (no match details).
        count_only: When true, only the count is produced (no match details).
        context_lines: Number of lines of context to attach on each side of a
            match; values ``<= 0`` attach none.

    Returns:
        A tuple of the number of matching lines and the per-line details.
        The detail list is empty whenever *files_only* or *count_only* is set.
    """
    if _is_binary(raw[:_BINARY_CHUNK]):
        return 0, []

    # Strict decoding: the documented contract is that non-UTF-8 files are
    # skipped, not searched with U+FFFD substitutions that could produce
    # misleading matches.
    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError:
        return 0, []

    # splitlines() drops every line terminator (including a lone "\r"), so the
    # resulting strings need no further stripping.
    all_lines = text.splitlines()
    collect_details = not (files_only or count_only)

    matches: list[GrepMatch] = []
    total = 0
    for idx, line in enumerate(all_lines):
        if not pattern.search(line):
            continue
        total += 1
        if not collect_details:
            continue

        before: list[str] = []
        after: list[str] = []
        if context_lines > 0:
            before = all_lines[max(0, idx - context_lines):idx]
            after = all_lines[idx + 1:idx + 1 + context_lines]
        matches.append(
            GrepMatch(
                line_number=idx + 1,  # reported line numbers are 1-based
                line=line,
                context_before=before,
                context_after=after,
            )
        )

    return total, matches
227
def _search_object(
    root_path: pathlib.Path,
    object_id: str,
    pattern: re.Pattern[str],
    files_only: bool,
    count_only: bool,
    context_lines: int,
) -> tuple[int, list[GrepMatch]]:
    """Grep one committed object; return ``(match_count, matches)``.

    The object is loaded with ``read_object``. A read that raises ``OSError``
    is logged as a warning and, like a ``None`` result, is treated as
    "no matches" so one unreadable object does not abort the search.
    """
    try:
        blob = read_object(root_path, object_id)
    except OSError as exc:
        logger.warning("⚠️ grep: could not read object %s: %s", object_id, exc)
        blob = None

    if blob is None:
        return 0, []
    return _search_lines(blob, pattern, files_only, count_only, context_lines)
247
def _search_disk_file(
    abs_path: pathlib.Path,
    pattern: re.Pattern[str],
    files_only: bool,
    count_only: bool,
    context_lines: int,
) -> tuple[int, list[GrepMatch]]:
    """Grep one file from the working tree; return ``(match_count, matches)``.

    A file that cannot be read (``OSError``) is logged as a warning and
    reported as having no matches.
    """
    try:
        payload = abs_path.read_bytes()
    except OSError as exc:
        logger.warning("⚠️ grep: could not read %s: %s", abs_path, exc)
        return 0, []
    else:
        return _search_lines(payload, pattern, files_only, count_only, context_lines)
263
def _walk_working_tree(
    root: pathlib.Path,
    include_glob: str | None,
    exclude_glob: str | None,
) -> list[tuple[str, pathlib.Path]]:
    """Collect ``(rel_path, abs_path)`` pairs for searchable files under *root*.

    Directories are pruned when their name is in ``_SKIP_DIRS``, when the name
    starts with a dot, or when they themselves contain a ``.muse`` directory.
    Directories and files are visited in sorted order, and only files passing
    the include/exclude globs are returned.
    """
    found: list[tuple[str, pathlib.Path]] = []
    for current_dir, subdirs, files in os.walk(root):
        keep: list[str] = []
        for name in subdirs:
            if name in _SKIP_DIRS or name.startswith("."):
                continue
            if os.path.isdir(os.path.join(current_dir, name, ".muse")):
                continue
            keep.append(name)
        keep.sort()
        # In-place assignment is what makes os.walk skip the pruned directories.
        subdirs[:] = keep

        base = pathlib.Path(current_dir)
        for name in sorted(files):
            candidate = base / name
            try:
                relative = candidate.relative_to(root).as_posix()
            except ValueError:
                continue
            if _path_matches_globs(relative, include_glob, exclude_glob):
                found.append((relative, candidate))
    return found
288
289 # ---------------------------------------------------------------------------
290 # Registration
291 # ---------------------------------------------------------------------------
292
def register(subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]") -> None:
    """Attach the ``content-grep`` subcommand and its options to *subparsers*.

    The module docstring doubles as the command description, and ``run`` is
    installed as the handler through the parser's ``func`` default.
    """
    cmd = subparsers.add_parser(
        "content-grep",
        help="Search tracked file content for a pattern.",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add = cmd.add_argument

    add("pattern", help="Regular expression pattern to search for.")

    # Where to search.
    add(
        "--working-tree",
        "-w",
        action="store_true",
        dest="working_tree",
        help=(
            "Search files on disk (working tree) instead of the committed HEAD snapshot. "
            "Finds matches in uncommitted edits. Mutually exclusive with --ref."
        ),
    )
    add(
        "--ref",
        default=None,
        help="Branch, tag, or commit SHA to search (default: HEAD). Mutually exclusive with --working-tree.",
    )

    # How to match and what to print.
    add(
        "--ignore-case",
        "-i",
        action="store_true",
        dest="ignore_case",
        help="Case-insensitive matching.",
    )
    add(
        "--files-only",
        "-l",
        action="store_true",
        dest="files_only",
        help="Print only file paths with matches.",
    )
    add(
        "--count",
        "-c",
        action="store_true",
        dest="count_mode",
        help="Print only match counts per file.",
    )

    # Which files to consider.
    add(
        "--include",
        default=None,
        dest="include_glob",
        help="Only search files whose path matches this fnmatch glob (e.g. '*.py').",
    )
    add(
        "--exclude",
        default=None,
        dest="exclude_glob",
        help="Skip files whose path matches this fnmatch glob (e.g. '*.min.js').",
    )

    # Result shaping.
    add(
        "--max-matches",
        "-m",
        type=int,
        default=None,
        dest="max_matches",
        help="Stop after this many total matches across all files.",
    )
    add(
        "--context",
        "-C",
        type=int,
        default=0,
        dest="context_lines",
        help="Number of surrounding lines to include with each match (like grep -C).",
    )
    add(
        "--json",
        "-j",
        action="store_true",
        dest="json_out",
        help="Emit machine-readable JSON.",
    )

    cmd.set_defaults(func=run)
349
350 # ---------------------------------------------------------------------------
351 # Subcommand handler
352 # ---------------------------------------------------------------------------
353
def _cap_matches(
    match_count: int,
    matches: "list[GrepMatch]",
    remaining: int,
) -> "tuple[int, list[GrepMatch]]":
    """Clamp one file's result to the *remaining* ``--max-matches`` budget.

    The count is clamped directly rather than derived from ``len(matches)``:
    in files-only and count modes ``matches`` is always empty, so deriving the
    count from it would report zero matches and never consume the budget.
    """
    allowed = max(0, remaining)
    if match_count <= allowed:
        return match_count, matches
    return allowed, matches[:allowed]


def run(args: argparse.Namespace) -> None:
    """Search tracked file content for a regex pattern.

    Uses Python ``re`` syntax — metacharacters must be escaped for literal
    matches. A BRE-style ``\\|`` is rewritten to ``|`` before compiling.
    Without ``--working-tree`` the committed snapshot is searched (HEAD, or
    ``--ref`` when given); with ``--working-tree`` files are read from disk so
    uncommitted edits are visible. Binary and non-UTF-8 files are silently
    skipped.

    Agent quickstart
    ----------------
    ::

        muse content-grep "TODO|FIXME" --json
        muse content-grep "TODO" --working-tree --json
        muse content-grep "session\\.add" --files-only --json

    JSON fields
    -----------
    source                ``"commit"`` or ``"working-tree"``.
    commit_id             Commit searched; ``null`` in working-tree mode.
    snapshot_id           Snapshot searched; ``null`` in working-tree mode.
    pattern               Pattern searched (after ``\\|`` normalisation).
    total_files_matched   Number of files with at least one match.
    total_matches         Total number of line matches reported.
    results               One object per matching file: ``path``,
                          ``object_id``, ``match_count`` and ``matches``
                          (each with ``line_number``, ``line``,
                          ``context_before``, ``context_after``).

    Plus whatever envelope fields ``make_envelope`` contributes.

    Exit codes
    ----------
    0                            At least one match found.
    ``ExitCode.USER_ERROR``      No matches; no commits on the branch; unknown
                                 ref; invalid or over-long pattern; conflicting
                                 flags.
    ``ExitCode.INTERNAL_ERROR``  Resolved commit or its snapshot is missing.

    NOTE(review): ``require_repo`` presumably exits on its own when run outside
    a Muse repository — confirm its exit code there.
    """
    elapsed = start_timer()
    pattern: str = args.pattern
    working_tree: bool = args.working_tree
    ref: str | None = args.ref
    ignore_case: bool = args.ignore_case
    files_only: bool = args.files_only
    count_mode: bool = args.count_mode
    include_glob: str | None = args.include_glob
    exclude_glob: str | None = args.exclude_glob
    max_matches: int | None = args.max_matches
    context_lines: int = max(0, args.context_lines)  # negative widths mean "none"
    json_out: bool = args.json_out

    if working_tree and ref is not None:
        print("❌ --working-tree and --ref are mutually exclusive.", file=sys.stderr)
        raise SystemExit(ExitCode.USER_ERROR)

    # Validate pattern BEFORE any I/O — cheap rejection of bad inputs.
    if len(pattern) > _MAX_PATTERN_LEN:
        print(
            f"❌ Pattern too long ({len(pattern)} chars, max {_MAX_PATTERN_LEN}). "
            "Use a shorter pattern or re.escape() for literal matches.",
            file=sys.stderr,
        )
        raise SystemExit(ExitCode.USER_ERROR)

    # Normalize BRE-style \| to ERE-style | so agents with grep muscle memory
    # get the expected behaviour without needing to know the distinction.
    pattern = pattern.replace(r"\|", "|")

    flags = re.IGNORECASE if ignore_case else 0
    try:
        compiled: re.Pattern[str] = re.compile(pattern, flags)
    except re.error as exc:
        print(f"❌ Invalid regex: {exc}", file=sys.stderr)
        raise SystemExit(ExitCode.USER_ERROR) from exc

    root = require_repo()

    # ── Working-tree mode ────────────────────────────────────────────────────
    if working_tree:
        disk_files = _walk_working_tree(root, include_glob, exclude_glob)

        file_results: list[GrepFileResult] = []
        total_matches = 0

        for rel_path, abs_path in disk_files:
            # Stop reading further files once the budget is exhausted.
            if max_matches is not None and total_matches >= max_matches:
                break
            match_count, matches = _search_disk_file(
                abs_path, compiled, files_only, count_mode, context_lines
            )
            if match_count > 0:
                if max_matches is not None:
                    match_count, matches = _cap_matches(
                        match_count, matches, max_matches - total_matches
                    )
                file_results.append(
                    GrepFileResult(
                        path=rel_path,
                        object_id=None,
                        match_count=match_count,
                        matches=matches,
                    )
                )
                total_matches += match_count

        _emit(
            file_results=file_results,
            total_matches=total_matches,
            source="working-tree",
            commit_id=None,
            snapshot_id=None,
            pattern=pattern,
            json_out=json_out,
            files_only=files_only,
            count_mode=count_mode,
            context_lines=context_lines,
            elapsed=elapsed,
        )
        if not file_results:
            raise SystemExit(ExitCode.USER_ERROR)  # exit 1 = no matches
        return

    # ── Snapshot mode (default) ──────────────────────────────────────────────
    branch = read_current_branch(root)

    if ref is None:
        commit_id = get_head_commit_id(root, branch)
        if commit_id is None:
            print("❌ No commits on current branch.", file=sys.stderr)
            raise SystemExit(ExitCode.USER_ERROR)
    else:
        commit_rec = resolve_commit_ref(root, branch, ref)
        if commit_rec is None:
            print(f"❌ Ref '{sanitize_display(ref)}' not found.", file=sys.stderr)
            raise SystemExit(ExitCode.USER_ERROR)
        commit_id = commit_rec.commit_id

    commit = read_commit(root, commit_id)
    if commit is None:
        print(f"❌ Commit {commit_id} not found.", file=sys.stderr)
        raise SystemExit(ExitCode.INTERNAL_ERROR)

    snap = read_snapshot(root, commit.snapshot_id)
    if snap is None:
        print(f"❌ Snapshot {commit.snapshot_id} not found.", file=sys.stderr)
        raise SystemExit(ExitCode.INTERNAL_ERROR)

    # Sorted so that output order (and which files the cap keeps) is stable.
    filtered: list[tuple[str, str]] = [
        (rel_path, object_id)
        for rel_path, object_id in sorted(snap.manifest.items())
        if _path_matches_globs(rel_path, include_glob, exclude_glob)
    ]

    def _search(item: tuple[str, str]) -> tuple[str, str, int, list[GrepMatch]]:
        rel_path, object_id = item
        cnt, ms = _search_object(root, object_id, compiled, files_only, count_mode, context_lines)
        return rel_path, object_id, cnt, ms

    snap_results: list[GrepFileResult] = []
    snap_total = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=_DEFAULT_MAX_WORKERS) as pool:
        # pool.map yields results in input order, so output stays sorted by path.
        for rel_path, object_id, match_count, matches in pool.map(_search, filtered):
            if match_count > 0:
                if max_matches is not None:
                    remaining = max_matches - snap_total
                    if remaining <= 0:
                        break
                    match_count, matches = _cap_matches(match_count, matches, remaining)
                snap_results.append(
                    GrepFileResult(
                        path=rel_path,
                        object_id=object_id,
                        match_count=match_count,
                        matches=matches,
                    )
                )
                snap_total += match_count

    _emit(
        file_results=snap_results,
        total_matches=snap_total,
        source="commit",
        commit_id=commit_id,
        snapshot_id=commit.snapshot_id,
        pattern=pattern,
        json_out=json_out,
        files_only=files_only,
        count_mode=count_mode,
        context_lines=context_lines,
        elapsed=elapsed,
    )
    if not snap_results:
        raise SystemExit(ExitCode.USER_ERROR)  # exit 1 = no matches
545
546 # ---------------------------------------------------------------------------
547 # Output helper (shared between modes)
548 # ---------------------------------------------------------------------------
549
def _emit(
    *,
    file_results: list[GrepFileResult],
    total_matches: int,
    source: str,
    commit_id: str | None,
    snapshot_id: str | None,
    pattern: str,
    json_out: bool,
    files_only: bool,
    count_mode: bool,
    context_lines: int,
    elapsed: Callable[[], float],
) -> None:
    """Write the search results to stdout, as JSON or as grep-like text.

    JSON mode prints a single ``_ContentGrepJson`` document. Text mode prints,
    per matching file, either the path (files-only), ``path:count`` (count
    mode), or one ``path:line_number:text`` line per match. When context is
    enabled, context lines are labelled with the *match's* line number and a
    ``-`` (before) or ``+`` (after) separator. All text output is passed
    through ``sanitize_display``.
    """
    if json_out:
        payload = _ContentGrepJson(
            **make_envelope(elapsed),
            source=source,
            commit_id=commit_id,
            snapshot_id=snapshot_id,
            pattern=pattern,
            total_files_matched=len(file_results),
            total_matches=total_matches,
            results=file_results,
        )
        print(json.dumps(payload))
        return

    show_context = context_lines > 0
    for entry in file_results:
        shown_path = sanitize_display(entry["path"])
        if files_only:
            print(shown_path)
            continue
        if count_mode:
            print(f"{shown_path}:{entry['match_count']}")
            continue

        for hit in entry["matches"]:
            number = hit["line_number"]
            if show_context:
                for ctx in hit["context_before"]:
                    print(f"{shown_path}:{number}-{sanitize_display(ctx)}")
            print(f"{shown_path}:{number}:{sanitize_display(hit['line'])}")
            if show_context:
                for ctx in hit["context_after"]:
                    print(f"{shown_path}:{number}+{sanitize_display(ctx)}")
File History 7 commits
sha256:18b983389ee1b55900fcd799bfbb496552d2e3ecded9d18cefbfef188947a12e chore: remove blob-debug test marker file Sonnet 4.6 23 hours ago
sha256:e452ad9a6ace6ccc6d875a35e06caf9da5576a970c1c36133b69a891ce5fefa8 chore: prebuild timing test Sonnet 4.6 8 days ago
sha256:0008ab6695e3e064b3e236b24fd19e538fef6a588eb0d211622f4466d919c0b1 merge: pull staging/dev — advance to 0.2.0rc12 Sonnet 4.6 patch 10 days ago
sha256:9c33d61749fff814c5226d5386aa2af7064c2c02788594a25fdd709358132eea fix: _PROPOSAL_PREFIX_RESOLVE_LIMIT 200 → 100 to match hub … Sonnet 4.6 21 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 24 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 30 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 30 days ago