gabriel / muse public
query.py python
632 lines 22.4 KB
Raw
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor ⚠ breaking 28 days ago
1 """muse code query — symbol graph predicate query (v2).
2
3 SQL for your codebase. A full predicate DSL over the typed, content-addressed
4 symbol graph — with OR, NOT, grouping, and an expanded field set.
5
6 v2 grammar::
7
8 expr = or_expr
9 or_expr = and_expr ( OR and_expr )*
10 and_expr = not_expr ( [AND] not_expr )* # implicit AND
11 not_expr = NOT primary | primary
12 primary = "(" expr ")" | atom
13 atom = KEY OP VALUE
14
15 Supported operators::
16
17 = exact match
18 ~= contains (case-insensitive)
19 ^= starts with (case-insensitive)
20 $= ends with (case-insensitive)
21 != not equal
22
23 Supported keys::
24
25 kind function | class | method | variable | import | …
26 language Python | Go | Rust | TypeScript | …
27 name bare symbol name
28 qualified_name dotted name (User.save)
29 file file path
30 hash content_id prefix (exact-body match)
31 body_hash body_hash prefix
32 signature_id signature_id prefix
33 lineno_gt symbol starts after line N
34 lineno_lt symbol starts before line N
35 size_gt symbol body exceeds N lines (end_lineno − lineno > N)
36 size_lt symbol body shorter than N lines
37
38 Usage::
39
40 muse code query "kind=function" "language=Python" "name~=validate"
41 muse code query "(kind=function OR kind=method) name^=_"
42 muse code query "NOT kind=import" "file~=billing"
43 muse code query "hash=a3f2c9"
44 muse code query "kind=function" "name$=_test" --commit HEAD~10
45 muse code query "kind=function" "name~=validate" --all-commits
46 muse code query "kind=function" "size_gt=50" --sort size # biggest fns
47 muse code query "kind=function" "name~=compute" --count # just the count
48 muse code query "kind=function" --unique-bodies # find clones
49 muse code query "kind=function" --all-commits --since 2026-01-01 # added this year
50 """
51
52 import argparse
53 import collections.abc
54 import datetime
55 import json
56 import logging
57 import pathlib
58 import sys
59 from typing import TypedDict
60
61 from muse.core.types import short_id
62 from muse.core.envelope import EnvelopeJson, make_envelope
63 from muse.core.errors import ExitCode
64 from muse.core.repo import parse_date_arg, require_repo
65 from muse.core.store import (
66 CommitRecord,
67 get_all_commits,
68 get_commit_snapshot_manifest,
69 read_current_branch,
70 resolve_commit_ref,
71 )
72 from muse.core.symbol_cache import SymbolCache, load_symbol_cache
73 from muse.core.timing import start_timer
74 from muse.plugins.code._predicate import Predicate, PredicateError, parse_query
75 from muse.plugins.code._query import language_of, symbols_for_snapshot
76 from muse.plugins.code.ast_parser import SymbolRecord
77 from muse.core.validation import clamp_int, sanitize_display
78
79 type _QueryResult = dict[str, str | int | bool]
80
class _QueryJson(EnvelopeJson):
    """Envelope printed by ``muse code query --json`` in single-snapshot mode.

    Extends ``EnvelopeJson`` with the query-specific keys listed below.
    ``run`` supplies every one of them when it builds the envelope.

    Keys
    ----
    commit:         Identifier of the commit whose snapshot was queried
                    (``run`` passes ``commit.commit_id``).
    sort:           Sort field in effect (``file``, ``name``, ``kind``,
                    ``lineno`` or ``size``).
    unique_bodies:  ``true`` when ``--unique-bodies`` was requested.
    truncated:      ``true`` when ``--limit`` cut the result list short.
    results:        One record per matching symbol.
    """

    commit: str
    sort: str
    unique_bodies: bool
    truncated: bool
    results: list[_QueryResult]
100
class _AllCommitsJson(EnvelopeJson):
    """Envelope printed by ``muse code query --all-commits --json``.

    Extends ``EnvelopeJson`` with the keys listed below. ``run`` supplies
    every one of them when it builds the envelope.

    Keys
    ----
    mode:       The literal string ``"all-commits"``.
    truncated:  ``true`` when the commit walk was capped by ``--max-commits``.
    results:    One record per historical symbol match.
    """

    mode: str
    truncated: bool
    results: list[_QueryResult]
116
117 type _StrMap = dict[str, str]
118 type _IconMap = dict[str, str]
119 logger = logging.getLogger(__name__)
120
121 _KIND_ICON: _IconMap = {
122 "function": "fn",
123 "async_function": "fn~",
124 "class": "class",
125 "method": "method",
126 "async_method": "method~",
127 "variable": "var",
128 "import": "import",
129 }
130
131 _VALID_SORT_FIELDS = frozenset({"file", "name", "kind", "lineno", "size"})
132
133 class _HistoricalMatch:
134 """A symbol match found in a historical commit (--all-commits mode)."""
135
136 def __init__(
137 self,
138 address: str,
139 rec: SymbolRecord,
140 commit: CommitRecord,
141 first_seen: bool,
142 ) -> None:
143 self.address = address
144 self.rec = rec
145 self.commit = commit
146 self.first_seen = first_seen
147
148 def to_dict(self) -> _QueryResult:
149 return {
150 "address": self.address,
151 "kind": self.rec["kind"],
152 "name": self.rec["name"],
153 "content_id": self.rec["content_id"],
154 "first_seen": self.first_seen,
155 "commit_id": self.commit.commit_id,
156 "commit_message": self.commit.message,
157 "committed_at": self.commit.committed_at.isoformat(),
158 "branch": self.commit.branch,
159 }
160
def _query_all_commits(
    root: pathlib.Path,
    filters: list[Predicate],
    max_commits: int,
    since: datetime.date | None,
    until: datetime.date | None,
) -> tuple[list[_HistoricalMatch], bool]:
    """Evaluate the predicates against every commit's snapshot, oldest first.

    Commits are ordered by ``committed_at``, narrowed to the optional
    ``since``/``until`` window (both bounds inclusive, compared by calendar
    date), and then capped at ``max_commits``. Each distinct ``snapshot_id``
    is examined only once, so commits that share a snapshot contribute a
    single pass.

    A single ``SymbolCache`` is loaded up front and handed to every
    ``symbols_for_snapshot`` call; it is saved once in a ``finally`` block so
    the save also happens when the walk is interrupted by an exception.

    Args:
        root: Repository root.
        filters: Predicates; a symbol matches only when all of them accept it.
        max_commits: Maximum number of commits (after date filtering) to walk.
        since: Earliest commit date to consider, or ``None`` for no bound.
        until: Latest commit date to consider, or ``None`` for no bound.

    Returns:
        ``(matches, truncated)`` where ``truncated`` is True when more commits
        were eligible than ``max_commits`` allowed.
    """
    history = get_all_commits(root)
    if not history:
        return [], False

    timeline = sorted(history, key=lambda entry: entry.committed_at)

    # Narrow by date before touching any snapshot so out-of-window commits
    # never trigger a manifest load.
    if since is not None or until is not None:

        def _in_window(day: datetime.date) -> bool:
            if since is not None and day < since:
                return False
            if until is not None and day > until:
                return False
            return True

        timeline = [
            entry for entry in timeline if _in_window(entry.committed_at.date())
        ]

    capped = len(timeline) > max_commits
    timeline = timeline[:max_commits]

    found: list[_HistoricalMatch] = []
    known_bodies: _StrMap = {}  # content_id -> commit where it first matched
    visited_snapshots: set[str] = set()

    # One cache instance for the whole walk.
    cache: SymbolCache = load_symbol_cache(root)

    try:
        for commit in timeline:
            snapshot = commit.snapshot_id
            if snapshot in visited_snapshots:
                continue
            visited_snapshots.add(snapshot)

            manifest = get_commit_snapshot_manifest(root, commit.commit_id)
            if not manifest:
                continue

            per_file = symbols_for_snapshot(root, manifest, cache=cache)
            for file_path in sorted(per_file):
                ordered_symbols = sorted(
                    per_file[file_path].items(),
                    key=lambda item: item[1]["lineno"],
                )
                for addr, rec in ordered_symbols:
                    if any(not accepts(file_path, rec) for accepts in filters):
                        continue
                    body_id = rec["content_id"]
                    newly_seen = body_id not in known_bodies
                    if newly_seen:
                        known_bodies[body_id] = commit.commit_id
                    found.append(_HistoricalMatch(addr, rec, commit, newly_seen))
    finally:
        # Save the cache even when the loop above is cut short by an error.
        cache.save()

    return found, capped
234
# Shape of one single-snapshot match: (file_path, symbol_address, symbol_record).
_SortTuple = tuple[str, str, SymbolRecord]
236
237 def _sort_key(sort_by: str) -> collections.abc.Callable[[_SortTuple], tuple[str | int, ...]]:
238 """Return a sort key function for a list of ``(file_path, addr, rec)`` tuples."""
239 if sort_by == "name":
240 return lambda t: (t[2]["name"].lower(), t[0], t[2]["lineno"])
241 if sort_by == "kind":
242 return lambda t: (t[2]["kind"], t[0], t[2]["lineno"])
243 if sort_by == "lineno":
244 return lambda t: (t[2]["lineno"], t[0])
245 if sort_by == "size":
246 # Negate size so largest comes first.
247 return lambda t: (-(t[2]["end_lineno"] - t[2]["lineno"]), t[0])
248 # Default: file then lineno.
249 return lambda t: (t[0], t[2]["lineno"])
250
def register(
    subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]",
) -> None:
    """Attach the ``query`` subcommand to the given subparser collection.

    The new parser uses this module's docstring as its description and
    dispatches to ``run``. Arguments registered, in help order:

    - ``PREDICATE`` (positional, zero or more at the argparse level; ``run``
      rejects an empty list) — predicate expressions in the v2 DSL.
    - ``--commit`` / ``-c`` REF — stored as ``ref``; query a historical snapshot.
    - ``--all-commits`` — search every commit.
    - ``--since`` / ``--until`` YYYY-MM-DD — date bounds for ``--all-commits``.
    - ``--max-commits`` N — cap on commits walked (default 10 000).
    - ``--limit`` N — cap on results (default 0, meaning unlimited).
    - ``--sort`` FIELD — one of ``_VALID_SORT_FIELDS`` (default ``file``).
    - ``--count`` — print only the number of matches.
    - ``--unique-bodies`` — deduplicate by ``content_id``.
    - ``--hashes`` — stored as ``show_hashes``; include content hashes.
    - ``--committed`` — skip the working-tree overlay.
    - ``--json`` / ``-j`` — stored as ``json_out``; emit JSON.
    """
    sort_fields = sorted(_VALID_SORT_FIELDS)

    query_cmd = subparsers.add_parser(
        "query",
        help="Query the symbol graph with a predicate DSL.",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add = query_cmd.add_argument

    # Positional predicates.
    add(
        "predicates",
        nargs="*",
        metavar="PREDICATE",
        help='One or more predicates, e.g. "kind=function" "name~=validate".',
    )

    # Snapshot selection.
    add(
        "--commit", "-c",
        dest="ref",
        default=None,
        metavar="REF",
        help="Query a historical snapshot instead of HEAD.",
    )
    add(
        "--all-commits",
        action="store_true",
        help=(
            "Search across ALL commits (every branch). Enables temporal hash= queries:"
            " find when a function body first appeared. Mutually exclusive with --commit."
        ),
    )
    add(
        "--since",
        metavar="YYYY-MM-DD",
        default=None,
        help="With --all-commits: only consider commits on or after this date.",
    )
    add(
        "--until",
        metavar="YYYY-MM-DD",
        default=None,
        help="With --all-commits: only consider commits on or before this date.",
    )
    add(
        "--max-commits",
        type=int,
        default=10_000,
        metavar="N",
        help="With --all-commits: cap the number of commits walked (default: 10000).",
    )

    # Result shaping.
    add(
        "--limit",
        type=int,
        default=0,
        metavar="N",
        help="Cap the number of results returned (0 = unlimited).",
    )
    add(
        "--sort",
        default="file",
        metavar="FIELD",
        choices=sort_fields,
        help=f"Sort results by field: {', '.join(sort_fields)} (default: file).",
    )
    add(
        "--count",
        action="store_true",
        help="Print only the count of matching symbols — no symbol list.",
    )
    add(
        "--unique-bodies",
        action="store_true",
        help=(
            "Deduplicate by content_id — show only unique implementations."
            " Turns muse query into a clone detector."
        ),
    )

    # Output options.
    add(
        "--hashes",
        dest="show_hashes",
        action="store_true",
        help="Include content hashes in output.",
    )
    add(
        "--committed",
        action="store_true",
        help=(
            "Query the last committed snapshot only — do not overlay working-tree changes."
            " By default, files on disk take precedence over the committed snapshot."
        ),
    )
    add(
        "--json", "-j",
        dest="json_out",
        action="store_true",
        help="Emit results as JSON for agent consumption (see _QueryJson / _AllCommitsJson schema).",
    )

    query_cmd.set_defaults(func=run)
376
def run(args: argparse.Namespace) -> None:
    """Execute ``muse code query``: evaluate predicates against the symbol graph.

    The positional predicates are parsed with ``parse_query`` into a single
    combined predicate, which is then applied to every symbol record of the
    selected snapshot(s).

    Modes
    -----
    Single-snapshot (default)
        Resolves ``--commit REF`` (or HEAD of the current branch) and queries
        that commit's manifest. When neither ``--committed`` nor ``--commit``
        is given, the repository root is passed to ``symbols_for_snapshot`` as
        ``workdir`` so working-tree files are overlaid on the snapshot.
        Matches are sorted (``--sort``), optionally deduplicated by
        ``content_id`` (``--unique-bodies``), then capped (``--limit``).

    ``--all-commits``
        Walks history via ``_query_all_commits`` honouring ``--since``,
        ``--until`` and ``--max-commits``. Human output lists each address
        once, at the commit where its body was first seen. ``--sort``,
        ``--unique-bodies`` and ``--committed`` have no effect in this mode.

    Output precedence
    -----------------
    - Single-snapshot: ``--count`` wins over ``--json``.
    - ``--all-commits``: ``--json`` wins over ``--count``; the JSON results
      are the full, un-deduplicated match list and ignore ``--limit``.
    - ``--count`` always prints a bare integer, including ``0``.

    JSON envelopes (``--json`` / ``-j``)
    ------------------------------------
    Single-snapshot mode emits ``_QueryJson``: the ``make_envelope`` fields
    plus ``commit`` (``commit.commit_id``), ``sort``, ``unique_bodies``,
    ``truncated`` (True when ``--limit`` cut the list) and ``results``.

    ``--all-commits`` mode emits ``_AllCommitsJson``: the ``make_envelope``
    fields plus ``mode`` (``"all-commits"``), ``truncated`` (True when
    ``--max-commits`` was hit) and ``results``.

    Raises:
        SystemExit: With ``ExitCode.USER_ERROR`` when no predicate is given,
            when ``--all-commits`` is combined with ``--commit``, when
            ``--since``/``--until`` are used without ``--all-commits``, when a
            predicate fails to parse, or when the commit cannot be resolved.
    """
    elapsed = start_timer()

    predicates: list[str] = args.predicates
    ref: str | None = args.ref
    all_commits: bool = args.all_commits
    committed_only: bool = args.committed
    show_hashes: bool = args.show_hashes
    json_out: bool = args.json_out
    count_only: bool = args.count
    limit: int = clamp_int(args.limit, 0, 10000, 'limit')
    sort_by: str = args.sort
    unique_bodies: bool = args.unique_bodies
    max_commits: int = clamp_int(args.max_commits, 1, 100000, 'max_commits')

    root = require_repo()
    branch = read_current_branch(root)

    if not predicates:
        print("❌ At least one predicate is required.", file=sys.stderr)
        raise SystemExit(ExitCode.USER_ERROR)

    if all_commits and ref is not None:
        print(
            "❌ --all-commits and --commit are mutually exclusive.",
            file=sys.stderr,
        )
        raise SystemExit(ExitCode.USER_ERROR)

    # NOTE(review): clamp_int presumably already enforces these bounds; the
    # explicit checks are kept as a defensive guard — confirm and simplify.
    if limit < 0:
        print("❌ --limit must be >= 0.", file=sys.stderr)
        raise SystemExit(ExitCode.USER_ERROR)

    if max_commits < 1:
        print("❌ --max-commits must be >= 1.", file=sys.stderr)
        raise SystemExit(ExitCode.USER_ERROR)

    # Parse --since / --until date filters.
    since_date: datetime.date | None = (
        parse_date_arg(args.since, "--since").date() if args.since else None
    )
    until_date: datetime.date | None = (
        parse_date_arg(args.until, "--until").date() if args.until else None
    )

    if (since_date or until_date) and not all_commits:
        print(
            "❌ --since / --until require --all-commits.", file=sys.stderr
        )
        raise SystemExit(ExitCode.USER_ERROR)

    # Parse predicates via the v2 grammar.
    try:
        combined_predicate: Predicate = parse_query(predicates)
    except PredicateError as exc:
        print(f"❌ {exc}", file=sys.stderr)
        raise SystemExit(ExitCode.USER_ERROR) from exc
    filters: list[Predicate] = [combined_predicate]

    # ── --all-commits mode ────────────────────────────────────────────────────
    if all_commits:
        historical, truncated = _query_all_commits(
            root, filters, max_commits, since_date, until_date
        )

        if json_out:
            print(json.dumps(_AllCommitsJson(
                **make_envelope(elapsed),
                mode="all-commits",
                truncated=truncated,
                results=[h.to_dict() for h in historical],
            )))
            return

        # Deduplicate for display: show unique addresses with first-seen commit.
        seen_addrs: set[str] = set()
        unique: list[_HistoricalMatch] = []
        for h in historical:
            if h.first_seen and h.address not in seen_addrs:
                seen_addrs.add(h.address)
                unique.append(h)

        if limit > 0:
            unique = unique[:limit]

        # Handle --count before the empty-result message so a query with no
        # matches prints ``0`` rather than prose, as single-snapshot mode does.
        if count_only:
            print(len(unique))
            return

        pred_display = " AND ".join(sanitize_display(p) for p in predicates)

        if not unique:
            print(
                f" (no symbols matching: {pred_display}"
                f" [searched all commits])"
            )
            return

        trunc_note = " ⚠️ truncated" if truncated else ""
        print(
            f"\n{len(unique)} unique symbol(s) matching"
            f" [{pred_display}] across all commits{trunc_note}\n"
        )
        for h in unique:
            date_str = h.commit.committed_at.strftime("%Y-%m-%d")
            cid = short_id(h.commit.commit_id)
            icon = _KIND_ICON.get(h.rec["kind"], h.rec["kind"])
            hash_part = f" {short_id(h.rec['content_id'])}.." if show_hashes else ""
            branch_label = (
                f" [{h.commit.branch}]" if h.commit.branch else ""
            )
            # Addresses are derived from repository content; sanitise them
            # before printing, as the single-snapshot path below does.
            print(
                f" {sanitize_display(h.address):<60} {icon:<8}"
                f" first seen {cid} {date_str}"
                f"{branch_label}{hash_part}"
            )
        return

    # ── Single-snapshot mode (default) ────────────────────────────────────────
    commit = resolve_commit_ref(root, branch, ref)
    if commit is None:
        print(
            f"❌ Commit '{ref or 'HEAD'}' not found.", file=sys.stderr
        )
        raise SystemExit(ExitCode.USER_ERROR)

    manifest = get_commit_snapshot_manifest(root, commit.commit_id) or {}
    # Default: overlay working-tree files so uncommitted edits are visible.
    # --committed (or an explicit --commit REF) pins to the snapshot.
    workdir = None if committed_only or ref is not None else root
    symbol_map = symbols_for_snapshot(root, manifest, workdir=workdir)

    # Collect matches.
    matches: list[tuple[str, str, SymbolRecord]] = [
        (file_path, addr, rec)
        for file_path, tree in symbol_map.items()
        for addr, rec in tree.items()
        if all(f(file_path, rec) for f in filters)
    ]

    # Sort.
    matches.sort(key=_sort_key(sort_by))

    # Unique-bodies: keep the first match (in sort order) for each content_id.
    if unique_bodies:
        seen_cids: set[str] = set()
        deduped: list[tuple[str, str, SymbolRecord]] = []
        for fp, addr, rec in matches:
            cid = rec["content_id"]
            if cid not in seen_cids:
                seen_cids.add(cid)
                deduped.append((fp, addr, rec))
        matches = deduped

    # Apply limit.
    limited = limit > 0 and len(matches) > limit
    if limited:
        matches = matches[:limit]

    # Count-only output.
    if count_only:
        print(len(matches))
        return

    # JSON output.
    if json_out:
        result_records = [
            {
                "address": addr,
                "kind": rec["kind"],
                "name": rec["name"],
                "qualified_name": rec["qualified_name"],
                "file": fp,
                "lineno": rec["lineno"],
                "end_lineno": rec["end_lineno"],
                "size": rec["end_lineno"] - rec["lineno"],
                "language": language_of(fp),
                "content_id": rec["content_id"],
                "body_hash": rec["body_hash"],
                "signature_id": rec["signature_id"],
            }
            for fp, addr, rec in matches
        ]
        # NOTE(review): the schema docs describe ``commit`` as a *short* ID,
        # but the full ``commit_id`` is emitted here. Left unchanged so JSON
        # consumers are not broken — confirm which one is intended.
        print(json.dumps(_QueryJson(
            **make_envelope(elapsed),
            commit=commit.commit_id,
            sort=sort_by,
            unique_bodies=unique_bodies,
            truncated=limited,
            results=result_records,
        )))
        return

    # Human-readable output.
    pred_display = " AND ".join(sanitize_display(p) for p in predicates)

    if not matches:
        print(f" (no symbols matching: {pred_display})")
        return

    files_seen: set[str] = set()
    for fp, addr, rec in matches:
        files_seen.add(fp)
        icon = _KIND_ICON.get(rec["kind"], rec["kind"])
        line = rec["lineno"]
        size = rec["end_lineno"] - rec["lineno"]
        hash_part = f" {short_id(rec['content_id'])}.." if show_hashes else ""
        # The size column is only shown when results are ordered by size.
        size_part = f" {size:>3}L" if sort_by == "size" else ""
        print(f" {sanitize_display(addr):<60} {icon:<10} line {line:>4}{size_part}{hash_part}")

    trunc_note = f" (limited to {limit})" if limited else ""
    print(
        f"\n{len(matches)} match(es) across {len(files_seen)} file(s)"
        f" [{pred_display}]{trunc_note}"
    )
File History 1 commit
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 28 days ago