tests/test_cmd_content_grep_hardening.py · gabriel/muse

test_cmd_content_grep_hardening.py python

969 lines 33.8 KB

sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b fix: try fetch/presign before fetch/mpack to avoid Cloudfla… Sonnet 4.6 patch 7 days ago

1	"""Hardening tests for ``muse content-grep``.
2
3	Covers:
4	Unit — _is_binary, _path_matches_globs, _search_object (context,
5	binary skip, utf-8 replace), pattern validation order
6	Security — ANSI injection in file paths and match text, pattern length
7	cap, invalid regex, ReDoS pattern rejected before I/O
8	Perf — parallel reads complete correctly, --max-matches cap
9	JSON — _ContentGrepJson schema (commit_id, snapshot_id, totals),
10	GrepMatch context_before/context_after fields
11	Flags — --include, --exclude, --max-matches, --context/-C, --json,
12	rejection of old --format flag
13	Integration — multi-file with mixed hits, --include narrows search,
14	--exclude skips files, --context shows surrounding lines,
15	--ref searches historical commit
16	E2E — --help output mentions all new flags
17	Stress — 500-file snapshot, concurrent parallel reads
18	"""
19
20	from __future__ import annotations
21	from collections.abc import Mapping
22
23	import datetime
24	import json
25	import pathlib
26	import threading
27	from typing import TypedDict
28
29	import pytest
30	from tests.cli_test_helper import CliRunner, InvokeResult
31
32	from muse.core.object_store import write_object
33	from muse.core.ids import hash_commit, hash_snapshot
34	from muse.core.commits import (
35	CommitRecord,
36	write_commit,
37	)
38	from muse.core.snapshots import (
39	SnapshotRecord,
40	write_snapshot,
41	)
42	from muse.core.types import Manifest, blob_id
43
# Placeholder CLI object; it is forwarded unchanged to ``runner.invoke`` by ``_invoke``.
cli = None
runner = CliRunner()
# Held by ``_invoke`` around every ``runner.invoke`` call.
_invoke_lock = threading.Lock()

# Repo-relative path -> raw file bytes, as consumed by ``_commit_files``.
type _FilesMap = dict[str, bytes]

# Default ``repo_id`` written into ``repo.json`` by ``_init_repo``.
_REPO_ID = "cgrep-hardening"
51
52
53	# ---------------------------------------------------------------------------
54	# Helpers
55	# ---------------------------------------------------------------------------
56
57
class _GrepMatchOut(TypedDict):
    """One entry of a result's ``matches`` list in ``--json`` output."""

    line_number: int
    line: str  # text of the matching line
    context_before: list[str]  # lines shown before the match when --context is used
    context_after: list[str]  # lines shown after the match when --context is used
63
64
class _GrepResultOut(TypedDict):
    """One per-file entry of the ``results`` list in ``--json`` output.

    The file name lives under ``path``: the tests in this module read
    ``results[i]["path"]`` and assert that no ``file`` key is emitted.
    """

    path: str
    object_id: str
    match_count: int
    matches: list[_GrepMatchOut]
70
71
class _GrepOut(TypedDict):
    """Top-level object emitted by ``content-grep --json``."""

    source: str  # tests in this module expect "commit" or "working-tree"
    # NOTE(review): tests in this module assert commit_id and snapshot_id are
    # None with --working-tree; the ``str`` annotations may need widening.
    commit_id: str
    snapshot_id: str
    pattern: str
    total_files_matched: int
    total_matches: int
    results: list[_GrepResultOut]
    duration_ms: float
    exit_code: int
82
83
84
85
def _init_repo(path: pathlib.Path, repo_id: str = _REPO_ID) -> pathlib.Path:
    """Create the on-disk repository skeleton under *path* and return *path*.

    Creates the ``commits``, ``snapshots``, ``objects`` and ``refs/heads``
    directories inside the directory returned by ``muse_dir(path)``, then
    writes ``HEAD`` and ``repo.json``.
    """
    store = muse_dir(path)
    subdirs = ("commits", "snapshots", "objects", "refs/heads")
    for name in subdirs:
        target = store / name
        target.mkdir(parents=True, exist_ok=True)

    head_file = store / "HEAD"
    head_file.write_text("ref: refs/heads/main", encoding="utf-8")

    repo_meta = json.dumps({"repo_id": repo_id, "domain": "midi"})
    (store / "repo.json").write_text(repo_meta, encoding="utf-8")
    return path
95
96
def _env(repo: pathlib.Path) -> Manifest:
    """Return the environment mapping with ``MUSE_REPO_ROOT`` set to *repo*."""
    env: Manifest = {}
    env["MUSE_REPO_ROOT"] = str(repo)
    return env
99
100
# Incremented on every ``_commit_files`` call and embedded in the commit message.
_counter = 0


def _commit_files(
    root: pathlib.Path,
    files: _FilesMap,
    branch: str = "main",
    parent_id: str | None = None,
) -> str:
    """Write *files* as objects, a snapshot and a commit; return the commit id.

    Each file's bytes are stored under ``blob_id(content)``, the resulting
    manifest is written as a snapshot, a commit record pointing at that
    snapshot is written, and the ref for *branch* is set to the new commit id.
    """
    global _counter
    _counter += 1
    message = f"commit {_counter}"

    manifest: Manifest = {}
    for rel_path, payload in files.items():
        object_id = blob_id(payload)
        write_object(root, object_id, payload)
        manifest[rel_path] = object_id

    snapshot_id = hash_snapshot(manifest)
    snapshot = SnapshotRecord(snapshot_id=snapshot_id, manifest=manifest)
    write_snapshot(root, snapshot)

    now = datetime.datetime.now(datetime.timezone.utc)
    parents = [parent_id] if parent_id else []
    new_commit_id = hash_commit(parents, snapshot_id, message, now.isoformat())

    record = CommitRecord(
        commit_id=new_commit_id,
        branch=branch,
        snapshot_id=snapshot_id,
        message=message,
        committed_at=now,
        parent_commit_id=parent_id,
    )
    write_commit(root, record)

    # Set the branch ref to the commit just written.
    ref_file = ref_path(root, branch)
    ref_file.parent.mkdir(parents=True, exist_ok=True)
    ref_file.write_text(new_commit_id, encoding="utf-8")
    return new_commit_id
139
140
def _invoke(args: list[str], env: Manifest | None = None) -> InvokeResult:
    """Run the CLI with *args* while holding ``_invoke_lock``; return its result."""
    with _invoke_lock:
        outcome = runner.invoke(cli, args, env=env)
        return outcome
144
145
def _parse(result: InvokeResult) -> _GrepOut:
    """Decode ``result.output`` as JSON and return it typed as ``_GrepOut``."""
    payload: _GrepOut = json.loads(result.output)
    return payload
149
150
151	# ---------------------------------------------------------------------------
152	# Unit: _is_binary
153	# ---------------------------------------------------------------------------
154
155
def test_is_binary_null_byte() -> None:
    """A payload containing a NUL byte is reported as binary."""
    from muse.cli.commands.content_grep import _is_binary

    verdict = _is_binary(b"\x00hello")
    assert verdict is True
160
161
def test_is_binary_clean_text() -> None:
    """Plain newline-terminated text is not reported as binary."""
    from muse.cli.commands.content_grep import _is_binary

    sample = b"hello world\n"
    assert _is_binary(sample) is False
166
167
def test_is_binary_empty() -> None:
    """An empty payload is not reported as binary."""
    from muse.cli.commands.content_grep import _is_binary

    empty = b""
    assert _is_binary(empty) is False
172
173
174	# ---------------------------------------------------------------------------
175	# Unit: _path_matches_globs
176	# ---------------------------------------------------------------------------
177
178
def test_path_matches_no_filter() -> None:
    """With neither include nor exclude glob, the path is accepted."""
    from muse.cli.commands.content_grep import _path_matches_globs

    include_glob = None
    exclude_glob = None
    assert _path_matches_globs("src/main.py", include_glob, exclude_glob) is True
183
184
def test_path_matches_include_basename() -> None:
    """An include glob of ``*.py`` accepts ``src/main.py`` and rejects ``src/main.js``."""
    from muse.cli.commands.content_grep import _path_matches_globs

    include_glob = "*.py"
    accepted = _path_matches_globs("src/main.py", include_glob, None)
    assert accepted is True
    rejected = _path_matches_globs("src/main.js", include_glob, None)
    assert rejected is False
190
191
def test_path_matches_include_full_path() -> None:
    """An include glob containing a directory is matched against the full path."""
    from muse.cli.commands.content_grep import _path_matches_globs

    include_glob = "src/*.py"
    inside = _path_matches_globs("src/main.py", include_glob, None)
    assert inside is True
    outside = _path_matches_globs("tests/main.py", include_glob, None)
    assert outside is False
197
198
def test_path_matches_exclude_basename() -> None:
    """An exclude glob of ``*.min.js`` rejects ``app.min.js`` but keeps ``app.js``."""
    from muse.cli.commands.content_grep import _path_matches_globs

    exclude_glob = "*.min.js"
    minified = _path_matches_globs("app.min.js", None, exclude_glob)
    assert minified is False
    plain = _path_matches_globs("app.js", None, exclude_glob)
    assert plain is True
204
205
def test_path_matches_include_and_exclude() -> None:
    """Include and exclude globs are applied together.

    ``src/main.py`` matches the include glob and not the exclude glob, so it
    is accepted; ``test_foo.py`` matches both, so it is rejected.
    """
    from muse.cli.commands.content_grep import _path_matches_globs

    # The ``*`` wildcards are required for these to act as globs, consistent
    # with the other pattern tests in this module ("*.py", "test_*.py").
    assert _path_matches_globs("src/main.py", "*.py", "test_*.py") is True
    assert _path_matches_globs("test_foo.py", "*.py", "test_*.py") is False
211
212
213	# ---------------------------------------------------------------------------
214	# Unit: _search_object — context lines
215	# ---------------------------------------------------------------------------
216
217
def test_search_object_context(tmp_path: pathlib.Path) -> None:
    """With ``context_lines=1`` a match carries one line before and one after."""
    import re
    from muse.cli.commands.content_grep import _search_object

    _init_repo(tmp_path)
    payload = b"line one\nTARGET line\nline three\n"
    object_id = blob_id(payload)
    write_object(tmp_path, object_id, payload)

    regex = re.compile("TARGET")
    hit_count, hits = _search_object(
        tmp_path, object_id, regex, False, False, context_lines=1
    )
    assert hit_count == 1
    assert len(hits) == 1
    only_hit = hits[0]
    assert only_hit["context_before"] == ["line one"]
    assert only_hit["context_after"] == ["line three"]
233
234
def test_search_object_context_at_boundary(tmp_path: pathlib.Path) -> None:
    """Context is clipped at the start and end of the file."""
    import re
    from muse.cli.commands.content_grep import _search_object

    _init_repo(tmp_path)
    payload = b"TARGET\nonly\n"
    object_id = blob_id(payload)
    write_object(tmp_path, object_id, payload)

    regex = re.compile("TARGET")
    _, hits = _search_object(
        tmp_path, object_id, regex, False, False, context_lines=3
    )
    first_hit = hits[0]
    assert first_hit["context_before"] == []
    assert first_hit["context_after"] == ["only"]
248
249
def test_search_object_no_context(tmp_path: pathlib.Path) -> None:
    """With ``context_lines=0`` both context lists are empty."""
    import re
    from muse.cli.commands.content_grep import _search_object

    _init_repo(tmp_path)
    payload = b"line\nTARGET\nend\n"
    object_id = blob_id(payload)
    write_object(tmp_path, object_id, payload)

    regex = re.compile("TARGET")
    _, hits = _search_object(
        tmp_path, object_id, regex, False, False, context_lines=0
    )
    first_hit = hits[0]
    assert first_hit["context_before"] == []
    assert first_hit["context_after"] == []
263
264
def test_search_object_binary_skipped(tmp_path: pathlib.Path) -> None:
    """An object containing NUL bytes yields no matches even if the pattern occurs."""
    import re
    from muse.cli.commands.content_grep import _search_object

    _init_repo(tmp_path)
    payload = b"\x00\x01\x02TARGET\x03"
    object_id = blob_id(payload)
    write_object(tmp_path, object_id, payload)

    regex = re.compile("TARGET")
    outcome = _search_object(tmp_path, object_id, regex, False, False, 0)
    hit_count, hits = outcome
    assert hit_count == 0
    assert hits == []
278
279
280	# ---------------------------------------------------------------------------
281	# Security: pattern validation happens BEFORE I/O
282	# ---------------------------------------------------------------------------
283
284
def test_long_pattern_rejected_before_io(tmp_path: pathlib.Path) -> None:
    """A too-long pattern must be rejected without touching the object store."""
    _init_repo(tmp_path)
    # Nothing is committed on purpose: if the command reached the commit
    # lookup it would report missing commits instead of 'pattern too long'.
    oversized = "a" * 501
    result = _invoke(["content-grep", oversized], env=_env(tmp_path))
    assert result.exit_code != 0
    # The failure must be about pattern length, not about missing commits.
    needle = "too long"
    reported = (
        needle in result.output.lower()
        or needle in (result.stderr or "").lower()
    )
    assert reported
297
298
def test_invalid_regex_rejected_before_io(tmp_path: pathlib.Path) -> None:
    """An uncompilable regex fails with a message mentioning 'regex'."""
    _init_repo(tmp_path)
    result = _invoke(["content-grep", "[unclosed"], env=_env(tmp_path))
    assert result.exit_code != 0
    needle = "regex"
    reported = (
        needle in result.output.lower()
        or needle in (result.stderr or "").lower()
    )
    assert reported
306
307
308	# ---------------------------------------------------------------------------
309	# Security: ANSI injection
310	# ---------------------------------------------------------------------------
311
312
def test_ansi_injection_in_path(tmp_path: pathlib.Path) -> None:
    """File paths with ANSI escapes must be stripped in text output."""
    _init_repo(tmp_path)
    hostile_name = "\x1b[31mmalicious\x1b[0m.txt"
    _commit_files(tmp_path, {hostile_name: b"TARGET content\n"})
    argv = ["content-grep", "TARGET"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code == 0
    assert "\x1b" not in result.output
323
324
def test_ansi_injection_in_match_text(tmp_path: pathlib.Path) -> None:
    """Match text with ANSI escapes must be stripped in text output."""
    _init_repo(tmp_path)
    hostile_line = b"TARGET \x1b[31mred\x1b[0m content\n"
    _commit_files(tmp_path, {"safe.txt": hostile_line})
    argv = ["content-grep", "TARGET"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code == 0
    assert "\x1b" not in result.output
334
335
336	# ---------------------------------------------------------------------------
337	# JSON schema: _ContentGrepJson
338	# ---------------------------------------------------------------------------
339
340
def test_json_schema_all_fields(tmp_path: pathlib.Path) -> None:
    """A two-hit search reports ids, pattern, totals and one per-file result."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"a.txt": b"hello world\nhello again\n"})
    result = _invoke(["content-grep", "hello", "--json"], env=_env(tmp_path))
    assert result.exit_code == 0

    payload = _parse(result)
    commit_id = payload["commit_id"]
    assert commit_id.startswith("sha256:")
    assert len(commit_id) == 71
    snapshot_id = payload["snapshot_id"]
    assert snapshot_id.startswith("sha256:")
    assert len(snapshot_id) == 71

    assert payload["pattern"] == "hello"
    assert payload["total_files_matched"] == 1
    assert payload["total_matches"] == 2

    per_file = payload["results"]
    assert len(per_file) == 1
    entry = per_file[0]
    assert entry["path"] == "a.txt"
    assert entry["match_count"] == 2
    assert isinstance(entry["matches"], list)
361
362
def test_json_schema_context_fields(tmp_path: pathlib.Path) -> None:
    """With ``--context 1`` each JSON match carries its surrounding lines."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"c.txt": b"before\nTARGET\nafter\n"})
    argv = ["content-grep", "TARGET", "--context", "1", "--json"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code == 0

    payload = _parse(result)
    first_match = payload["results"][0]["matches"][0]
    assert isinstance(first_match, dict)
    assert "context_before" in first_match
    assert "context_after" in first_match
    assert first_match["context_before"] == ["before"]
    assert first_match["context_after"] == ["after"]
378
379
def test_json_schema_no_match_exit1(tmp_path: pathlib.Path) -> None:
    """A pattern that matches nothing exits non-zero in ``--json`` mode."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"a.txt": b"hello\n"})
    argv = ["content-grep", "ZZZNOMATCH", "--json"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code != 0
387
388
def test_json_total_matches_multiple_files(tmp_path: pathlib.Path) -> None:
    """Totals are summed across files; files without hits are not counted."""
    _init_repo(tmp_path)
    tree: _FilesMap = {
        "a.txt": b"hit\nhit\n",
        "b.txt": b"hit\n",
        "c.txt": b"miss\n",
    }
    _commit_files(tmp_path, tree)
    result = _invoke(["content-grep", "hit", "--json"], env=_env(tmp_path))
    assert result.exit_code == 0
    payload = _parse(result)
    assert payload["total_files_matched"] == 2
    assert payload["total_matches"] == 3
403
404
405	# ---------------------------------------------------------------------------
406	# Flags: --include
407	# ---------------------------------------------------------------------------
408
409
def test_include_filters_to_py_only(tmp_path: pathlib.Path) -> None:
    """``--include *.py`` restricts results to the single Python file."""
    _init_repo(tmp_path)
    tree: _FilesMap = {
        "module.py": b"TARGET in python\n",
        "module.js": b"TARGET in js\n",
        "readme.md": b"TARGET in md\n",
    }
    _commit_files(tmp_path, tree)
    argv = ["content-grep", "TARGET", "--include", "*.py", "--json"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code == 0
    payload = _parse(result)
    assert payload["total_files_matched"] == 1
    assert payload["results"][0]["path"] == "module.py"
425
426
def test_include_no_matches_after_filter(tmp_path: pathlib.Path) -> None:
    """When no file passes ``--include`` the command exits non-zero."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"module.js": b"TARGET here\n"})
    argv = ["content-grep", "TARGET", "--include", "*.py"]
    result = _invoke(argv, env=_env(tmp_path))
    # No files pass the include filter.
    assert result.exit_code != 0
435
436
437	# ---------------------------------------------------------------------------
438	# Flags: --exclude
439	# ---------------------------------------------------------------------------
440
441
def test_exclude_skips_minified(tmp_path: pathlib.Path) -> None:
    """``--exclude *.min.js`` drops the minified file and keeps ``app.js``."""
    _init_repo(tmp_path)
    tree: _FilesMap = {
        "app.js": b"TARGET here\n",
        "app.min.js": b"TARGET minified\n",
    }
    _commit_files(tmp_path, tree)
    argv = ["content-grep", "TARGET", "--exclude", "*.min.js", "--json"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code == 0
    payload = _parse(result)
    assert payload["total_files_matched"] == 1
    assert payload["results"][0]["path"] == "app.js"
456
457
def test_exclude_all_results_in_no_match(tmp_path: pathlib.Path) -> None:
    """An exclude glob that does not match the only file leaves it searchable."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"test.py": b"TARGET\n"})
    argv = ["content-grep", "TARGET", "--exclude", "test_*.py"]
    result = _invoke(argv, env=_env(tmp_path))
    # "test.py" does not match the "test_*.py" exclude pattern, so the file
    # is still searched and the command succeeds.
    assert result.exit_code == 0
468
469
470	# ---------------------------------------------------------------------------
471	# Flags: --max-matches
472	# ---------------------------------------------------------------------------
473
474
def test_max_matches_caps_output(tmp_path: pathlib.Path) -> None:
    """``--max-matches 10`` keeps the reported total at or below ten."""
    _init_repo(tmp_path)
    hundred_hits = b"hit\n" * 100
    _commit_files(tmp_path, {"many.txt": hundred_hits})
    argv = ["content-grep", "hit", "--max-matches", "10", "--json"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code == 0
    payload = _parse(result)
    assert payload["total_matches"] <= 10
485
486
def test_max_matches_zero_still_exits_nonzero_on_cap(tmp_path: pathlib.Path) -> None:
    """When max_matches=0, no results are kept — exit 1."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"a.txt": b"hit\n"})
    argv = ["content-grep", "hit", "--max-matches", "0", "--json"]
    result = _invoke(argv, env=_env(tmp_path))
    # No results survive the cap, so the exit status is non-zero.
    assert result.exit_code != 0
496
497
498	# ---------------------------------------------------------------------------
499	# Flags: --context / -C
500	# ---------------------------------------------------------------------------
501
502
def test_context_text_output(tmp_path: pathlib.Path) -> None:
    """``--context 1`` prints the lines around the match in text mode."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"ctx.txt": b"alpha\nbeta\ngamma\n"})
    argv = ["content-grep", "beta", "--context", "1"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code == 0
    # Both the preceding and the following line must be shown.
    printed = result.output
    assert "alpha" in printed
    assert "gamma" in printed
514
515
def test_context_short_flag(tmp_path: pathlib.Path) -> None:
    """``-C 1`` prints the lines around the match, like ``--context 1``."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"ctx2.txt": b"first\nTARGET\nlast\n"})
    argv = ["content-grep", "TARGET", "-C", "1"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code == 0
    printed = result.output
    assert "first" in printed
    assert "last" in printed
526
527
528	# ---------------------------------------------------------------------------
529	# Flags: --json boolean (rejects old --format)
530	# ---------------------------------------------------------------------------
531
532
def test_format_flag_rejected(tmp_path: pathlib.Path) -> None:
    """Old ``--format json`` must be rejected by argparse (exit 2)."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"a.txt": b"hello\n"})
    legacy_argv = ["content-grep", "hello", "--format", "json"]
    result = _invoke(legacy_argv, env=_env(tmp_path))
    assert result.exit_code == 2
542
543
544	# ---------------------------------------------------------------------------
545	# Integration: --ref searches a different commit
546	# ---------------------------------------------------------------------------
547
548
def test_ref_searches_branch(tmp_path: pathlib.Path) -> None:
    """``--ref <commit>`` searches that commit's snapshot instead of HEAD."""
    _init_repo(tmp_path)
    first_commit = _commit_files(tmp_path, {"v1.txt": b"OLD content\n"})
    _commit_files(tmp_path, {"v2.txt": b"NEW content\n"}, parent_id=first_commit)
    env = _env(tmp_path)

    # Default search (HEAD) must find NEW in v2.txt.
    head_result = _invoke(["content-grep", "NEW", "--json"], env=env)
    assert head_result.exit_code == 0
    head_payload = _parse(head_result)
    head_paths = [entry["path"] for entry in head_payload["results"]]
    assert "v2.txt" in head_paths

    # Searching the first commit by id must find OLD in v1.txt.
    ref_result = _invoke(
        ["content-grep", "OLD", "--ref", first_commit, "--json"], env=env
    )
    assert ref_result.exit_code == 0
    ref_payload = _parse(ref_result)
    ref_paths = [entry["path"] for entry in ref_payload["results"]]
    assert "v1.txt" in ref_paths
    assert ref_payload["commit_id"] == first_commit
573
574
575	# ---------------------------------------------------------------------------
576	# E2E: --help mentions all new flags
577	# ---------------------------------------------------------------------------
578
579
def test_help_mentions_include() -> None:
    """``--help`` exits 0 and lists ``--include``."""
    help_result = _invoke(["content-grep", "--help"])
    assert help_result.exit_code == 0
    help_text = help_result.output
    assert "--include" in help_text
584
585
def test_help_mentions_exclude() -> None:
    """``--help`` lists ``--exclude``."""
    help_text = _invoke(["content-grep", "--help"]).output
    assert "--exclude" in help_text
589
590
def test_help_mentions_max_matches() -> None:
    """``--help`` lists ``--max-matches``."""
    help_text = _invoke(["content-grep", "--help"]).output
    assert "--max-matches" in help_text
594
595
def test_help_mentions_context() -> None:
    """``--help`` lists the context flag in its long or short form."""
    help_text = _invoke(["content-grep", "--help"]).output
    mentioned = "--context" in help_text or "-C" in help_text
    assert mentioned
599
600
def test_help_mentions_json_not_format() -> None:
    """``--help`` lists ``--json`` and no longer lists ``--format``."""
    help_text = _invoke(["content-grep", "--help"]).output
    assert "--json" in help_text
    assert "--format" not in help_text
605
606
607	# ---------------------------------------------------------------------------
608	# Stress: 500-file snapshot, pattern matches 250
609	# ---------------------------------------------------------------------------
610
611
def test_stress_500_files(tmp_path: pathlib.Path) -> None:
    """In a 500-file snapshot where every other file matches, 250 hits are reported."""
    _init_repo(tmp_path)
    # Even-numbered files contain the needle; odd-numbered files do not.
    tree: _FilesMap = {
        f"f_{index:04d}.txt": (b"TARGET_STRESS\n" if index % 2 == 0 else b"other\n")
        for index in range(500)
    }
    _commit_files(tmp_path, tree)
    argv = ["content-grep", "TARGET_STRESS", "--json"]
    result = _invoke(argv, env=_env(tmp_path))
    assert result.exit_code == 0
    payload = _parse(result)
    assert payload["total_files_matched"] == 250
    assert payload["total_matches"] == 250
627
628
629	# ---------------------------------------------------------------------------
630	# Stress: concurrent reads
631	# ---------------------------------------------------------------------------
632
633
def test_stress_concurrent_reads(tmp_path: pathlib.Path) -> None:
    """Eight threads issuing the same search must each see exactly one match."""
    _init_repo(tmp_path)
    _commit_files(tmp_path, {"concurrent.txt": b"CONCURRENT TARGET\n"})

    errors: list[str] = []

    def _read() -> None:
        """Run one search and record any deviation in ``errors``."""
        outcome = _invoke(
            ["content-grep", "CONCURRENT", "--json"],
            env=_env(tmp_path),
        )
        if outcome.exit_code != 0:
            errors.append(f"exit {outcome.exit_code}")
            return
        try:
            decoded = json.loads(outcome.output)
        except json.JSONDecodeError as exc:
            errors.append(str(exc))
        else:
            if decoded.get("total_matches", 0) != 1:
                errors.append(
                    f"unexpected total_matches: {decoded.get('total_matches')}"
                )

    workers = [threading.Thread(target=_read) for _ in range(8)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    assert not errors, f"Concurrent read failures: {errors}"
662
663
664	# ---------------------------------------------------------------------------
665	# JSON schema: complete key set (TestJsonSchemaComplete)
666	# ---------------------------------------------------------------------------
667
668
# Top-level keys that TestJsonSchemaComplete requires in every ``--json`` payload.
_REQUIRED_KEYS = frozenset({
    "source",
    "commit_id",
    "snapshot_id",
    "pattern",
    "total_files_matched",
    "total_matches",
    "results",
    "duration_ms",
    "exit_code",
})
680
681
class TestJsonSchemaComplete:
    """Verify that every required key is present in JSON output."""

    def test_all_required_keys_present_commit_mode(self, tmp_path: pathlib.Path) -> None:
        """Commit-mode JSON output contains every key in ``_REQUIRED_KEYS``."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hello\n"})
        argv = ["content-grep", "hello", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        assert result.exit_code == 0
        payload = json.loads(result.output)
        missing = _REQUIRED_KEYS - payload.keys()
        assert not missing, f"Missing keys: {missing}"

    def test_all_required_keys_present_working_tree_mode(self, tmp_path: pathlib.Path) -> None:
        """Working-tree JSON output contains every key in ``_REQUIRED_KEYS``."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hello\n"})
        # The file must also exist on disk for the working-tree search to find it.
        on_disk = tmp_path / "a.txt"
        on_disk.write_bytes(b"hello\n")
        argv = ["content-grep", "hello", "--working-tree", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        assert result.exit_code == 0
        payload = json.loads(result.output)
        missing = _REQUIRED_KEYS - payload.keys()
        assert not missing, f"Missing keys: {missing}"

    def test_source_field_is_commit(self, tmp_path: pathlib.Path) -> None:
        """``source`` is ``"commit"`` for a default search."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hello\n"})
        argv = ["content-grep", "hello", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        assert payload["source"] == "commit"

    def test_source_field_is_working_tree(self, tmp_path: pathlib.Path) -> None:
        """``source`` is ``"working-tree"`` when ``--working-tree`` is given."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hello\n"})
        on_disk = tmp_path / "a.txt"
        on_disk.write_bytes(b"hello\n")
        argv = ["content-grep", "hello", "--working-tree", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        assert payload["source"] == "working-tree"

    def test_commit_id_null_in_working_tree_mode(self, tmp_path: pathlib.Path) -> None:
        """``commit_id`` is null when searching the working tree."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hello\n"})
        on_disk = tmp_path / "a.txt"
        on_disk.write_bytes(b"hello\n")
        argv = ["content-grep", "hello", "--working-tree", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        assert payload["commit_id"] is None

    def test_snapshot_id_null_in_working_tree_mode(self, tmp_path: pathlib.Path) -> None:
        """``snapshot_id`` is null when searching the working tree."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hello\n"})
        on_disk = tmp_path / "a.txt"
        on_disk.write_bytes(b"hello\n")
        argv = ["content-grep", "hello", "--working-tree", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        assert payload["snapshot_id"] is None

    def test_exit_code_field_zero_on_match(self, tmp_path: pathlib.Path) -> None:
        """The JSON ``exit_code`` field is 0 when the pattern matches."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hello\n"})
        argv = ["content-grep", "hello", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        assert payload["exit_code"] == 0

    def test_json_is_compact(self, tmp_path: pathlib.Path) -> None:
        """JSON output must be a single line — no pretty-printing."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hello\n"})
        argv = ["content-grep", "hello", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        non_blank = [row for row in result.output.splitlines() if row.strip()]
        assert len(non_blank) == 1, "JSON must be compact (one line)"
762
763
764	# ---------------------------------------------------------------------------
765	# duration_ms (TestElapsedSeconds)
766	# ---------------------------------------------------------------------------
767
768
class TestElapsedSeconds:
    """``duration_ms`` must be a non-negative float in all JSON paths."""

    def _assert_elapsed(self, data: Mapping[str, object]) -> None:  # type: ignore[type-arg]
        """Assert *data* carries a non-negative float under ``duration_ms``."""
        assert "duration_ms" in data
        assert isinstance(data["duration_ms"], float)
        assert data["duration_ms"] >= 0.0

    def test_elapsed_present_commit_mode(self, tmp_path: pathlib.Path) -> None:
        """``duration_ms`` is present for a default (commit) search."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"target\n"})
        result = _invoke(["content-grep", "target", "--json"], env=_env(tmp_path))
        self._assert_elapsed(json.loads(result.output))

    def test_elapsed_present_working_tree_mode(self, tmp_path: pathlib.Path) -> None:
        """``duration_ms`` is present for a ``--working-tree`` search."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"target\n"})
        (tmp_path / "a.txt").write_bytes(b"target\n")
        result = _invoke(
            ["content-grep", "target", "--working-tree", "--json"],
            env=_env(tmp_path),
        )
        self._assert_elapsed(json.loads(result.output))

    def test_elapsed_is_float_not_int(self, tmp_path: pathlib.Path) -> None:
        """``duration_ms`` decodes as a float, never an int."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"target\n"})
        result = _invoke(["content-grep", "target", "--json"], env=_env(tmp_path))
        data = json.loads(result.output)
        assert isinstance(data["duration_ms"], float)

    def test_elapsed_reasonable_upper_bound(self, tmp_path: pathlib.Path) -> None:
        """Single-file search in a temp repo should be well under 5 seconds."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"target\n"})
        result = _invoke(["content-grep", "target", "--json"], env=_env(tmp_path))
        data = json.loads(result.output)
        # The field name says milliseconds, so the documented 5-second
        # ceiling is 5000.0; a 5.0 bound would be 5 ms and flaky under load.
        assert data["duration_ms"] < 5000.0

    def test_elapsed_present_stress_mode(self, tmp_path: pathlib.Path) -> None:
        """duration_ms must appear even for a 50-file snapshot search."""
        _init_repo(tmp_path)
        # Annotated as _FilesMap to match the parameter type of _commit_files.
        files: _FilesMap = {f"f{i}.txt": b"needle\n" for i in range(50)}
        _commit_files(tmp_path, files)
        result = _invoke(["content-grep", "needle", "--json"], env=_env(tmp_path))
        assert result.exit_code == 0
        self._assert_elapsed(json.loads(result.output))

    def test_elapsed_six_decimal_places(self, tmp_path: pathlib.Path) -> None:
        """duration_ms should be rounded to at most 6 decimal places."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"target\n"})
        result = _invoke(["content-grep", "target", "--json"], env=_env(tmp_path))
        data = json.loads(result.output)
        elapsed = data["duration_ms"]
        # Round-tripping through a 6-decimal representation must be exact.
        assert round(elapsed, 6) == elapsed
826
827
828	# ---------------------------------------------------------------------------
829	# exit_code field (TestExitCode)
830	# ---------------------------------------------------------------------------
831
832
class TestExitCode:
    """``exit_code`` in JSON must mirror the process exit code."""

    def test_exit_code_zero_on_match(self, tmp_path: pathlib.Path) -> None:
        """A matching commit search exits 0 and reports ``exit_code`` 0."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hit\n"})
        argv = ["content-grep", "hit", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        assert result.exit_code == 0
        payload = json.loads(result.output)
        assert payload["exit_code"] == 0

    def test_exit_code_zero_working_tree_match(self, tmp_path: pathlib.Path) -> None:
        """A matching working-tree search exits 0 and reports ``exit_code`` 0."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hit\n"})
        on_disk = tmp_path / "a.txt"
        on_disk.write_bytes(b"hit\n")
        argv = ["content-grep", "hit", "--working-tree", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        assert result.exit_code == 0
        payload = json.loads(result.output)
        assert payload["exit_code"] == 0

    def test_exit_code_is_integer(self, tmp_path: pathlib.Path) -> None:
        """The JSON ``exit_code`` field decodes as an int."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hit\n"})
        argv = ["content-grep", "hit", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        assert isinstance(payload["exit_code"], int)

    def test_exit_code_in_json_matches_process_exit(self, tmp_path: pathlib.Path) -> None:
        """JSON exit_code must equal the actual process exit code."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hit\n"})
        argv = ["content-grep", "hit", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        assert payload["exit_code"] == result.exit_code

    def test_exit_code_multiple_files(self, tmp_path: pathlib.Path) -> None:
        """Matches spread over two files still exit 0 with ``exit_code`` 0."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.txt": b"hit\n", "b.txt": b"hit\n"})
        argv = ["content-grep", "hit", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        assert result.exit_code == 0
        payload = json.loads(result.output)
        assert payload["exit_code"] == 0
875
876
877	# ---------------------------------------------------------------------------
878	# Flag registration tests
879	# ---------------------------------------------------------------------------
880
881	import argparse as _argparse
882	from muse.cli.commands.content_grep import register as _register_content_grep
883	from muse.core.paths import muse_dir, ref_path
884
885
def _parse_cgrep(*args: str) -> _argparse.Namespace:
    """Parse *args* as a ``content-grep`` invocation using a freshly built parser."""
    parser = _argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="cmd")
    _register_content_grep(subparsers)
    argv = ["content-grep", *args]
    return parser.parse_args(argv)
891
892
class TestRegisterFlags:
    """Checks on the namespace produced by the registered ``content-grep`` parser."""

    def test_default_json_out_is_false(self) -> None:
        """Without ``--json`` the ``json_out`` attribute is False."""
        parsed = _parse_cgrep("TODO")
        assert parsed.json_out is False

    def test_json_flag_sets_json_out(self) -> None:
        """``--json`` sets ``json_out`` to True."""
        parsed = _parse_cgrep("TODO", "--json")
        assert parsed.json_out is True

    def test_j_shorthand_sets_json_out(self) -> None:
        """``-j`` sets ``json_out`` to True."""
        parsed = _parse_cgrep("TODO", "-j")
        assert parsed.json_out is True

    def test_pattern_positional(self) -> None:
        """The first positional argument is stored as ``pattern``."""
        parsed = _parse_cgrep("FIXME")
        assert parsed.pattern == "FIXME"
909
910
911	# ---------------------------------------------------------------------------
# JSON key ergonomics: results[].path and matches[].line
913	# ---------------------------------------------------------------------------
914
915
class TestJsonKeyErgonomics:
    """content-grep --json must use 'path' (matching all other muse commands) and
    'line' (not 'text') for match content."""

    def test_result_key_is_path(self, tmp_path: pathlib.Path) -> None:
        """Each result exposes the file name under ``path``."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"src/main.py": b"hello world\n"})
        argv = ["content-grep", "hello", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        first_result = payload["results"][0]
        assert first_result["path"] == "src/main.py"

    def test_result_has_no_file_key(self, tmp_path: pathlib.Path) -> None:
        """Results do not carry a ``file`` key."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"src/main.py": b"hello world\n"})
        argv = ["content-grep", "hello", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        first_result = payload["results"][0]
        assert "file" not in first_result

    def test_match_key_is_line_not_text(self, tmp_path: pathlib.Path) -> None:
        """Each match exposes the matching text under ``line``."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.py": b"hello world\n"})
        argv = ["content-grep", "hello", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        first_match = payload["results"][0]["matches"][0]
        assert first_match["line"] == "hello world"

    def test_match_has_no_text_key(self, tmp_path: pathlib.Path) -> None:
        """Matches do not carry a ``text`` key."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.py": b"hello world\n"})
        argv = ["content-grep", "hello", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        first_match = payload["results"][0]["matches"][0]
        assert "text" not in first_match

    def test_working_tree_result_key_is_path(self, tmp_path: pathlib.Path) -> None:
        """Working-tree results also expose the file name under ``path``."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.py": b"placeholder\n"})
        on_disk = tmp_path / "a.py"
        on_disk.write_text("needle here\n", encoding="utf-8")
        argv = ["content-grep", "needle", "--working-tree", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        first_result = payload["results"][0]
        assert first_result["path"] == "a.py"

    def test_working_tree_match_key_is_line(self, tmp_path: pathlib.Path) -> None:
        """Working-tree matches also expose the matching text under ``line``."""
        _init_repo(tmp_path)
        _commit_files(tmp_path, {"a.py": b"placeholder\n"})
        on_disk = tmp_path / "a.py"
        on_disk.write_text("needle here\n", encoding="utf-8")
        argv = ["content-grep", "needle", "--working-tree", "--json"]
        result = _invoke(argv, env=_env(tmp_path))
        payload = json.loads(result.output)
        first_match = payload["results"][0]["matches"][0]
        assert first_match["line"] == "needle here"

File History 1 commit

sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b fix: try fetch/presign before fetch/mpack to avoid Cloudfla… Sonnet 4.6 patch 7 days ago

class _GrepMatchOut

class _GrepResultOut

class _GrepOut

function _init_repo

function _env

function _commit_files

function _invoke

function _parse

function test_is_binary_null_byte

function test_is_binary_clean_text

function test_is_binary_empty

function test_path_matches_no_filter

function test_path_matches_include_basename

function test_path_matches_include_full_path

function test_path_matches_exclude_basename

function test_path_matches_include_and_exclude

function test_search_object_context

function test_search_object_context_at_boundary

function test_search_object_no_context

function test_search_object_binary_skipped

function test_long_pattern_rejected_before_io

function test_invalid_regex_rejected_before_io

function test_ansi_injection_in_path

function test_ansi_injection_in_match_text

function test_json_schema_all_fields

function test_json_schema_context_fields

function test_json_schema_no_match_exit1

function test_json_total_matches_multiple_files

function test_include_filters_to_py_only

function test_include_no_matches_after_filter

function test_exclude_skips_minified

function test_exclude_all_results_in_no_match

function test_max_matches_caps_output

function test_max_matches_zero_still_exits_nonzero_on_cap

function test_context_text_output

function test_context_short_flag

function test_format_flag_rejected

function test_ref_searches_branch

function test_help_mentions_include

function test_help_mentions_exclude

function test_help_mentions_max_matches

function test_help_mentions_context

function test_help_mentions_json_not_format

function test_stress_500_files

function test_stress_concurrent_reads

function _read

class TestJsonSchemaComplete

function test_all_required_keys_present_commit_mode

function test_all_required_keys_present_working_tree_mode

function test_source_field_is_commit

function test_source_field_is_working_tree

function test_commit_id_null_in_working_tree_mode

function test_snapshot_id_null_in_working_tree_mode

function test_exit_code_field_zero_on_match

function test_json_is_compact

class TestElapsedSeconds

function _assert_elapsed

function test_elapsed_present_commit_mode

function test_elapsed_present_working_tree_mode

function test_elapsed_is_float_not_int

function test_elapsed_reasonable_upper_bound

function test_elapsed_present_stress_mode

function test_elapsed_six_decimal_places

class TestExitCode

function test_exit_code_zero_on_match

function test_exit_code_zero_working_tree_match

function test_exit_code_is_integer

function test_exit_code_in_json_matches_process_exit

function test_exit_code_multiple_files

function _parse_cgrep

class TestRegisterFlags

function test_default_json_out_is_false

function test_json_flag_sets_json_out

function test_j_shorthand_sets_json_out

function test_pattern_positional

class TestJsonKeyErgonomics

function test_result_key_is_path

function test_result_has_no_file_key

function test_match_key_is_line_not_text

function test_match_has_no_text_key

function test_working_tree_result_key_is_path

function test_working_tree_match_key_is_line

Path: tests/test_cmd_content_grep_hardening.py

Lines: 969

Size: 33.8 KB

Lang: Python

Ref: sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b

Object ID

sha256:0c2111c7b680db212b293a29a1b584b855b910fd2b102097540f13b8cfc82646…

Last commit

sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b

fix: try fetch/presign before fetch/mpack to avoi…

7 days ago

Quick links

Blame History