muse/cli/commands/content_grep.py · gabriel/muse

content_grep.py python

592 lines 20.8 KB

sha256:18b983389ee1b55900fcd799bfbb496552d2e3ecded9d18cefbfef188947a12e chore: remove blob-debug test marker file Sonnet 4.6 23 hours ago

1	"""``muse content-grep`` — full-text search across tracked files.
2
3	Searches the content of every tracked file for a pattern. By default the
4	search runs against the committed HEAD snapshot (reading from the object
5	store). Pass ``--working-tree`` to search the actual files on disk,
6	including uncommitted edits — essential for agents verifying their own
7	changes before committing.
8
9	``--working-tree`` and ``--ref`` are mutually exclusive.
10
11	Binary files and non-UTF-8 files are silently skipped. Regex safety:
12	patterns are compiled with a 500-character length limit to prevent
13	catastrophic backtracking (ReDoS).
14
15	Performance (snapshot mode): object reads run in parallel using a bounded
16	``ThreadPoolExecutor`` (``min(8, cpu_count())`` workers).
17
18	File filtering: ``--include`` and ``--exclude`` accept ``fnmatch``-style
19	glob patterns applied to relative file paths.
20
21	Usage::
22
23	muse content-grep "Cm7" # literal substring (HEAD)
24	muse content-grep "TODO" --working-tree # search working tree (disk)
25	muse content-grep "tempo:\\s+\\d+" # regex
26	muse content-grep "TODO" --ignore-case # case-insensitive
27	muse content-grep "chorus" --files-only # only file paths
28	muse content-grep "bass" --ref feat/audio # search a branch tip
29	muse content-grep "note" --include "*.txt" # only .txt files
30	muse content-grep "debug" --exclude "*.min.js"
31	muse content-grep "TODO" --max-matches 20 # cap results
32	muse content-grep "verse" --context 2 # 2 lines of context
33	muse content-grep "chord" --json # machine-readable
34
35	JSON output schema (``--json``)::
36
37	{
38	"source": "commit" \| "working-tree",
39	"commit_id": "sha256:<64 hex>" \| null,
40	"snapshot_id": "sha256:<64 hex>" \| null,
41	"pattern": "<pattern string>",
42	"total_files_matched": <int>,
43	"total_matches": <int>,
44	"results": [
45	{
46	"path": "<relative path>",
47	"object_id": "sha256:<64 hex>" \| null,
48	"match_count": <int>,
49	"matches": [
50	{
51	"line_number": <int>,
52	"line": "<matched line>",
53	"context_before": ["<line>", ...],
54	"context_after": ["<line>", ...]
55	},
56	...
57	]
58	},
59	...
60	],
61	"duration_ms": 0.001234,
62	"exit_code": 0
63	}
64
65	``duration_ms``
66	Wall-clock time from argument parsing to output.
67	``exit_code``
68	Mirrors the process exit code: ``0`` when matches found; ``1`` when no
69	matches.
70
71	Exit codes::
72
73	0 — pattern found in at least one file
74	1 — no matches (or no commits)
75	3 — I/O error
76	"""
77
78	import argparse
79	import concurrent.futures
80	import fnmatch
81	import json
82	import logging
83	import os
84	import pathlib
85	import re
86	import sys
87	from collections.abc import Callable
88	from typing import TypedDict
89
90	from muse.core.errors import ExitCode
91	from muse.core.object_store import read_object
92	from muse.core.repo import require_repo
93	from muse.core.refs import (
94	get_head_commit_id,
95	read_current_branch,
96	)
97	from muse.core.commits import (
98	read_commit,
99	resolve_commit_ref,
100	)
101	from muse.core.snapshots import read_snapshot
102	from muse.core.validation import sanitize_display
103	from muse.core.envelope import EnvelopeJson, make_envelope
104	from muse.core.timing import start_timer
105
# Module-level logger; the search helpers below use it to warn about
# objects or files that could not be read.
logger = logging.getLogger(__name__)

# Number of leading bytes inspected for NUL bytes when deciding whether
# content is binary.
_BINARY_CHUNK = 8192
# Reject patterns that could cause catastrophic backtracking.
# NOTE(review): a length cap is only a heuristic — short patterns can still
# backtrack badly; confirm whether a stronger guard is wanted.
_MAX_PATTERN_LEN = 500
# Thread-pool size for snapshot-mode object reads: at most 8, at least 1
# (``os.cpu_count()`` may return ``None``).
_DEFAULT_MAX_WORKERS = min(8, (os.cpu_count() or 1))

# Directories to skip when walking the working tree.
_SKIP_DIRS: frozenset[str] = frozenset({
    ".muse",
    ".git",
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
    ".tox",
    "node_modules",
    ".venv",
    "venv",
    ".env",
})
125
126	# ---------------------------------------------------------------------------
127	# TypedDicts for structured output
128	# ---------------------------------------------------------------------------
129
class GrepMatch(TypedDict):
    """A single matching line within a file, with optional surrounding context."""

    # 1-based line number of the matching line within the file.
    line_number: int
    # Text of the matching line (trailing carriage return stripped).
    line: str
    # Up to ``--context`` lines immediately preceding the match; empty when
    # context is disabled.
    context_before: list[str]
    # Up to ``--context`` lines immediately following the match; empty when
    # context is disabled.
    context_after: list[str]
137
class GrepFileResult(TypedDict):
    """All matches within a single file."""

    # File path relative to the repository root.
    path: str
    # Object-store ID of the searched blob; None when source is working-tree.
    object_id: str | None
    # Number of matching lines reported for this file.
    match_count: int
    # Per-line match details; left empty in --files-only and --count modes.
    matches: list[GrepMatch]
145
class _ContentGrepJson(EnvelopeJson):
    """Top-level JSON output for ``muse content-grep --json``.

    Extends ``EnvelopeJson``; the envelope's own fields are supplied by
    ``make_envelope`` when the payload is built.
    """

    # Where the content was read from: "commit" | "working-tree".
    source: str
    # Commit that was searched; None in working-tree mode.
    commit_id: str | None
    # Snapshot of that commit; None in working-tree mode.
    snapshot_id: str | None
    # The pattern string that was searched for.
    pattern: str
    # Number of entries in ``results``.
    total_files_matched: int
    # Sum of ``match_count`` across all entries in ``results``.
    total_matches: int
    # One entry per file with at least one match.
    results: list[GrepFileResult]
156
157	# ---------------------------------------------------------------------------
158	# Internal helpers
159	# ---------------------------------------------------------------------------
160
161	def _is_binary(data: bytes) -> bool:
162	"""Return ``True`` if data (the first chunk) contains null bytes."""
163	return b"\x00" in data
164
165	def _path_matches_globs(rel_path: str, include: str \| None, exclude: str \| None) -> bool:
166	"""Return ``True`` if rel_path passes the include/exclude glob filters.
167
168	``--include`` and ``--exclude`` use ``fnmatch`` on the basename and on
169	the full relative path so that patterns like ``.py`` and ``src/.py`` both
170	work intuitively.
171	"""
172	basename = pathlib.PurePosixPath(rel_path).name
173	if include is not None:
174	if not (fnmatch.fnmatch(basename, include) or fnmatch.fnmatch(rel_path, include)):
175	return False
176	if exclude is not None:
177	if fnmatch.fnmatch(basename, exclude) or fnmatch.fnmatch(rel_path, exclude):
178	return False
179	return True
180
def _search_lines(
    raw: bytes,
    pattern: re.Pattern[str],
    files_only: bool,
    count_only: bool,
    context_lines: int,
) -> tuple[int, list[GrepMatch]]:
    """Search raw bytes for pattern; return ``(match_count, matches)``.

    Binary content (a NUL byte within the first ``_BINARY_CHUNK`` bytes) and
    content that is not valid UTF-8 return ``(0, [])``.

    Args:
        raw: Complete file content.
        pattern: Compiled regex, applied to each line with ``search``.
        files_only: When true, only the count is produced; ``matches`` stays empty.
        count_only: When true, only the count is produced; ``matches`` stays empty.
        context_lines: Number of lines of context to attach before and after
            each match; values ``<= 0`` attach none.

    Returns:
        ``(match_count, matches)`` where ``match_count`` is the number of
        matching lines and ``matches`` holds one ``GrepMatch`` per matching
        line (empty in files-only / count-only mode).
    """
    if _is_binary(raw[:_BINARY_CHUNK]):
        return 0, []

    try:
        # Strict decoding: the documented contract is that non-UTF-8 files are
        # skipped, not searched with U+FFFD replacement characters mixed in.
        text = raw.decode("utf-8")
    except UnicodeDecodeError:
        return 0, []

    all_lines = text.splitlines()
    collect_details = not files_only and not count_only

    matches: list[GrepMatch] = []
    total = 0
    for lineno, line in enumerate(all_lines, start=1):
        if not pattern.search(line):
            continue
        total += 1
        if not collect_details:
            continue

        before: list[str] = []
        after: list[str] = []
        if context_lines > 0:
            idx = lineno - 1  # 0-based index of the matching line
            before = [
                ctx.rstrip("\r")
                for ctx in all_lines[max(0, idx - context_lines) : idx]
            ]
            after = [
                ctx.rstrip("\r")
                for ctx in all_lines[idx + 1 : idx + 1 + context_lines]
            ]
        matches.append(
            GrepMatch(
                line_number=lineno,
                line=line.rstrip("\r"),
                context_before=before,
                context_after=after,
            )
        )

    return total, matches
227
def _search_object(
    root_path: pathlib.Path,
    object_id: str,
    pattern: re.Pattern[str],
    files_only: bool,
    count_only: bool,
    context_lines: int,
) -> tuple[int, list[GrepMatch]]:
    """Load one committed object and grep it; return ``(match_count, matches)``.

    An object that cannot be read (``OSError``, logged as a warning) or that
    ``read_object`` reports as ``None`` yields ``(0, [])`` so a single bad
    object never aborts the whole search.
    """
    try:
        blob = read_object(root_path, object_id)
    except OSError as exc:
        logger.warning("⚠️ grep: could not read object %s: %s", object_id, exc)
        blob = None

    if blob is None:
        return 0, []

    return _search_lines(
        blob,
        pattern,
        files_only,
        count_only,
        context_lines,
    )
247
def _search_disk_file(
    abs_path: pathlib.Path,
    pattern: re.Pattern[str],
    files_only: bool,
    count_only: bool,
    context_lines: int,
) -> tuple[int, list[GrepMatch]]:
    """Read a working-tree file and grep it; return ``(match_count, matches)``.

    A file that cannot be read (``OSError``) is logged as a warning and
    treated as having no matches.
    """
    try:
        payload = abs_path.read_bytes()
    except OSError as exc:
        logger.warning("⚠️ grep: could not read %s: %s", abs_path, exc)
        return 0, []
    else:
        return _search_lines(
            payload,
            pattern,
            files_only,
            count_only,
            context_lines,
        )
263
def _walk_working_tree(
    root: pathlib.Path,
    include_glob: str | None,
    exclude_glob: str | None,
) -> list[tuple[str, pathlib.Path]]:
    """Collect ``(rel_path, abs_path)`` pairs for searchable files under *root*.

    Directories are visited in sorted order. A directory is not descended
    into when its name is in ``_SKIP_DIRS``, when it starts with a dot, or
    when it contains its own ``.muse`` directory. Files are then filtered
    through the include/exclude globs.
    """

    def _descend(parent: str, name: str) -> bool:
        # Cheap name checks first; only touch the filesystem if they pass.
        if name in _SKIP_DIRS or name.startswith("."):
            return False
        return not os.path.isdir(os.path.join(parent, name, ".muse"))

    found: list[tuple[str, pathlib.Path]] = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Slice-assign so os.walk sees the pruned, ordered list.
        dirnames[:] = sorted(name for name in dirnames if _descend(dirpath, name))

        here = pathlib.Path(dirpath)
        for filename in sorted(filenames):
            abs_path = here / filename
            try:
                rel_path = abs_path.relative_to(root).as_posix()
            except ValueError:
                # Not expressible relative to root — ignore it.
                continue
            if not _path_matches_globs(rel_path, include_glob, exclude_glob):
                continue
            found.append((rel_path, abs_path))
    return found
288
289	# ---------------------------------------------------------------------------
290	# Registration
291	# ---------------------------------------------------------------------------
292
def register(subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]") -> None:
    """Attach the ``content-grep`` subcommand and all of its options.

    The module docstring is used as the long description (rendered with
    ``RawDescriptionHelpFormatter``), and ``run`` is installed as the
    handler through ``set_defaults(func=...)``.
    """
    parser = subparsers.add_parser(
        "content-grep",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
        help="Search tracked file content for a pattern.",
    )
    add = parser.add_argument

    # Positional pattern.
    add("pattern", help="Regular expression pattern to search for.")

    # Where to search.
    add(
        "--working-tree",
        "-w",
        dest="working_tree",
        action="store_true",
        help=(
            "Search files on disk (working tree) instead of the committed HEAD snapshot. "
            "Finds matches in uncommitted edits. Mutually exclusive with --ref."
        ),
    )
    add(
        "--ref",
        default=None,
        help="Branch, tag, or commit SHA to search (default: HEAD). Mutually exclusive with --working-tree.",
    )

    # How to match and what to print.
    add(
        "--ignore-case",
        "-i",
        dest="ignore_case",
        action="store_true",
        help="Case-insensitive matching.",
    )
    add(
        "--files-only",
        "-l",
        dest="files_only",
        action="store_true",
        help="Print only file paths with matches.",
    )
    add(
        "--count",
        "-c",
        dest="count_mode",
        action="store_true",
        help="Print only match counts per file.",
    )

    # Path filters.
    add(
        "--include",
        dest="include_glob",
        default=None,
        help="Only search files whose path matches this fnmatch glob (e.g. '*.py').",
    )
    add(
        "--exclude",
        dest="exclude_glob",
        default=None,
        help="Skip files whose path matches this fnmatch glob (e.g. '*.min.js').",
    )

    # Result shaping.
    add(
        "--max-matches",
        "-m",
        dest="max_matches",
        type=int,
        default=None,
        help="Stop after this many total matches across all files.",
    )
    add(
        "--context",
        "-C",
        dest="context_lines",
        type=int,
        default=0,
        help="Number of surrounding lines to include with each match (like grep -C).",
    )
    add(
        "--json",
        "-j",
        dest="json_out",
        action="store_true",
        help="Emit machine-readable JSON.",
    )

    parser.set_defaults(func=run)
349
350	# ---------------------------------------------------------------------------
351	# Subcommand handler
352	# ---------------------------------------------------------------------------
353
def _clamp_to_budget(
    match_count: int,
    matches: list[GrepMatch],
    remaining: int,
) -> tuple[int, list[GrepMatch]]:
    """Trim one file's result to the remaining ``--max-matches`` budget.

    ``matches`` is empty in ``--files-only`` / ``--count`` mode, so the count
    is clamped directly instead of being derived from ``len(matches)``
    (which would wrongly collapse it to zero).
    """
    if match_count <= remaining:
        return match_count, matches
    return remaining, matches[:remaining]


def run(args: argparse.Namespace) -> None:
    """Search tracked file content for a regex pattern.

    The pattern is compiled with Python ``re``; metacharacters must be
    escaped for literal matches. Without ``--working-tree`` the committed
    snapshot (HEAD, or ``--ref``) is searched; with ``--working-tree`` files
    are read from disk so uncommitted edits are visible. Content rejected by
    the line searcher (e.g. binary files) contributes no matches.

    Agent quickstart
    ----------------
    ::

        muse content-grep "TODO|FIXME" --json
        muse content-grep "TODO" --working-tree --json
        muse content-grep "session\\.add" --files-only --json

    JSON fields
    -----------
    source               ``"commit"`` or ``"working-tree"``.
    commit_id            Commit searched; ``null`` in working-tree mode.
    snapshot_id          Snapshot searched; ``null`` in working-tree mode.
    pattern              Pattern searched (after ``\\|`` normalization).
    total_files_matched  Number of files with at least one match.
    total_matches        Total number of line matches reported.
    results              One object per file: ``path``, ``object_id``,
                         ``match_count`` and ``matches``. Each match has
                         ``line_number`` (1-based), ``line``,
                         ``context_before`` and ``context_after``;
                         ``matches`` is empty with ``--files-only`` /
                         ``--count``.

    The envelope fields produced by ``make_envelope`` are included as well
    (see the module docstring for the documented schema).

    Exit behaviour
    --------------
    Returns normally when at least one file matched. Raises
    ``SystemExit(ExitCode.USER_ERROR)`` when there are no matches, when
    ``--working-tree`` and ``--ref`` are combined, when the pattern is too
    long or not a valid regex, when the current branch has no commits, or
    when ``--ref`` cannot be resolved. Raises
    ``SystemExit(ExitCode.INTERNAL_ERROR)`` when the resolved commit or its
    snapshot cannot be read. Behaviour outside a repository is determined by
    ``require_repo``.
    """
    elapsed = start_timer()
    pattern: str = args.pattern
    working_tree: bool = args.working_tree
    ref: str | None = args.ref
    ignore_case: bool = args.ignore_case
    files_only: bool = args.files_only
    count_mode: bool = args.count_mode
    include_glob: str | None = args.include_glob
    exclude_glob: str | None = args.exclude_glob
    max_matches: int | None = args.max_matches
    context_lines: int = max(0, args.context_lines)  # negative context means none
    json_out: bool = args.json_out

    if working_tree and ref is not None:
        print("❌ --working-tree and --ref are mutually exclusive.", file=sys.stderr)
        raise SystemExit(ExitCode.USER_ERROR)

    # Validate pattern BEFORE any I/O — cheap rejection of bad inputs.
    if len(pattern) > _MAX_PATTERN_LEN:
        print(
            f"❌ Pattern too long ({len(pattern)} chars, max {_MAX_PATTERN_LEN}). "
            "Use a shorter pattern or re.escape() for literal matches.",
            file=sys.stderr,
        )
        raise SystemExit(ExitCode.USER_ERROR)

    # Normalize BRE-style \| to ERE-style | so agents with grep muscle memory
    # get the expected behaviour without needing to know the distinction.
    pattern = pattern.replace(r"\|", "|")

    flags = re.IGNORECASE if ignore_case else 0
    try:
        compiled: re.Pattern[str] = re.compile(pattern, flags)
    except re.error as exc:
        print(f"❌ Invalid regex: {exc}", file=sys.stderr)
        raise SystemExit(ExitCode.USER_ERROR) from exc

    root = require_repo()

    # ── Working-tree mode ────────────────────────────────────────────────────
    if working_tree:
        disk_files = _walk_working_tree(root, include_glob, exclude_glob)

        file_results: list[GrepFileResult] = []
        total_matches = 0

        for rel_path, abs_path in disk_files:
            # Stop scanning as soon as the budget is spent.
            if max_matches is not None and total_matches >= max_matches:
                break
            match_count, matches = _search_disk_file(
                abs_path, compiled, files_only, count_mode, context_lines
            )
            if match_count > 0:
                if max_matches is not None:
                    match_count, matches = _clamp_to_budget(
                        match_count, matches, max_matches - total_matches
                    )
                file_results.append(
                    GrepFileResult(
                        path=rel_path,
                        object_id=None,
                        match_count=match_count,
                        matches=matches,
                    )
                )
                total_matches += match_count

        _emit(
            file_results=file_results,
            total_matches=total_matches,
            source="working-tree",
            commit_id=None,
            snapshot_id=None,
            pattern=pattern,
            json_out=json_out,
            files_only=files_only,
            count_mode=count_mode,
            context_lines=context_lines,
            elapsed=elapsed,
        )
        if not file_results:
            raise SystemExit(ExitCode.USER_ERROR)  # exit 1 = no matches
        return

    # ── Snapshot mode (default) ──────────────────────────────────────────────
    branch = read_current_branch(root)

    if ref is None:
        commit_id = get_head_commit_id(root, branch)
        if commit_id is None:
            print("❌ No commits on current branch.", file=sys.stderr)
            raise SystemExit(ExitCode.USER_ERROR)
    else:
        commit_rec = resolve_commit_ref(root, branch, ref)
        if commit_rec is None:
            print(f"❌ Ref '{sanitize_display(ref)}' not found.", file=sys.stderr)
            raise SystemExit(ExitCode.USER_ERROR)
        commit_id = commit_rec.commit_id

    commit = read_commit(root, commit_id)
    if commit is None:
        print(f"❌ Commit {commit_id} not found.", file=sys.stderr)
        raise SystemExit(ExitCode.INTERNAL_ERROR)

    snap = read_snapshot(root, commit.snapshot_id)
    if snap is None:
        print(f"❌ Snapshot {commit.snapshot_id} not found.", file=sys.stderr)
        raise SystemExit(ExitCode.INTERNAL_ERROR)

    # Sorted so output order is deterministic regardless of manifest ordering.
    filtered: list[tuple[str, str]] = [
        (rel_path, object_id)
        for rel_path, object_id in sorted(snap.manifest.items())
        if _path_matches_globs(rel_path, include_glob, exclude_glob)
    ]

    def _search(item: tuple[str, str]) -> tuple[str, str, int, list[GrepMatch]]:
        """Worker: grep one ``(rel_path, object_id)`` manifest entry."""
        rel_path, object_id = item
        cnt, ms = _search_object(root, object_id, compiled, files_only, count_mode, context_lines)
        return rel_path, object_id, cnt, ms

    snap_results: list[GrepFileResult] = []
    snap_total = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=_DEFAULT_MAX_WORKERS) as pool:
        # pool.map yields results in input order, so output stays sorted by path.
        for rel_path, object_id, match_count, matches in pool.map(_search, filtered):
            if match_count > 0:
                if max_matches is not None:
                    remaining = max_matches - snap_total
                    if remaining <= 0:
                        break
                    match_count, matches = _clamp_to_budget(
                        match_count, matches, remaining
                    )
                snap_results.append(
                    GrepFileResult(
                        path=rel_path,
                        object_id=object_id,
                        match_count=match_count,
                        matches=matches,
                    )
                )
                snap_total += match_count

    _emit(
        file_results=snap_results,
        total_matches=snap_total,
        source="commit",
        commit_id=commit_id,
        snapshot_id=commit.snapshot_id,
        pattern=pattern,
        json_out=json_out,
        files_only=files_only,
        count_mode=count_mode,
        context_lines=context_lines,
        elapsed=elapsed,
    )
    if not snap_results:
        raise SystemExit(ExitCode.USER_ERROR)  # exit 1 = no matches
545
546	# ---------------------------------------------------------------------------
547	# Output helper (shared between modes)
548	# ---------------------------------------------------------------------------
549
def _emit(
    *,
    file_results: list[GrepFileResult],
    total_matches: int,
    source: str,
    commit_id: str | None,
    snapshot_id: str | None,
    pattern: str,
    json_out: bool,
    files_only: bool,
    count_mode: bool,
    context_lines: int,
    elapsed: Callable[[], float],
) -> None:
    """Write the search results to stdout, as JSON or as grep-style text.

    JSON mode prints a single ``_ContentGrepJson`` document. Text mode prints
    one line per file (``--files-only`` / ``--count``) or one line per match.
    When context is enabled, context lines are prefixed with the *match's*
    line number followed by ``-`` (before) or ``+`` (after). All
    file-derived text is passed through ``sanitize_display`` first.
    """
    if json_out:
        payload = _ContentGrepJson(
            **make_envelope(elapsed),
            source=source,
            commit_id=commit_id,
            snapshot_id=snapshot_id,
            pattern=pattern,
            total_files_matched=len(file_results),
            total_matches=total_matches,
            results=file_results,
        )
        print(json.dumps(payload))
        return

    with_context = context_lines > 0
    for file_result in file_results:
        safe_path = sanitize_display(file_result["path"])

        if files_only:
            print(safe_path)
            continue
        if count_mode:
            print(f"{safe_path}:{file_result['match_count']}")
            continue

        for hit in file_result["matches"]:
            anchor = hit["line_number"]
            if with_context:
                for ctx in hit["context_before"]:
                    print(f"{safe_path}:{anchor}-{sanitize_display(ctx)}")
            print(f"{safe_path}:{anchor}:{sanitize_display(hit['line'])}")
            if with_context:
                for ctx in hit["context_after"]:
                    print(f"{safe_path}:{anchor}+{sanitize_display(ctx)}")

File History 7 commits

sha256:18b983389ee1b55900fcd799bfbb496552d2e3ecded9d18cefbfef188947a12e chore: remove blob-debug test marker file Sonnet 4.6 23 hours ago

sha256:e452ad9a6ace6ccc6d875a35e06caf9da5576a970c1c36133b69a891ce5fefa8 chore: prebuild timing test Sonnet 4.6 8 days ago

sha256:0008ab6695e3e064b3e236b24fd19e538fef6a588eb0d211622f4466d919c0b1 merge: pull staging/dev — advance to 0.2.0rc12 Sonnet 4.6 patch 10 days ago

sha256:9c33d61749fff814c5226d5386aa2af7064c2c02788594a25fdd709358132eea fix: _PROPOSAL_PREFIX_RESOLVE_LIMIT 200 → 100 to match hub … Sonnet 4.6 21 days ago

sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 24 days ago

sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 30 days ago

sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor ⚠ 30 days ago

class GrepMatch

class GrepFileResult

class _ContentGrepJson

function _is_binary

function _path_matches_globs

function _search_lines

function _search_object

function _search_disk_file

function _walk_working_tree

function register

function run

function _search

function _emit

Pathmuse/cli/commands/content_grep.py

Lines592

Size20.8 KB

LangPython

Refsha256:18b983389ee1b55900fcd799bfbb496552d2e3ecded9d18cefbfef188947a12e

Object ID

sha256:fb143135337ad2587266938c762e051c4ad4b0a2999e391e6c25a8e0a33cb906…

Last commit

sha256:18b983389ee1b55900fcd799bfbb496552d2e3ecded9d18cefbfef188947a12e

chore: remove blob-debug test marker file

23 hours ago

Quick links

Blame History