gabriel / muse public
_code_query.py python
383 lines 12.8 KB
Raw
sha256:06dba78c2a78e251b580422dd1fd547f3c8357ff18f7709a860873b2d24dbbbf chore: bump version to 0.2.0rc14 Sonnet 4.6 patch 1 day ago
1 """Code-domain query evaluator for the Muse generic query engine.
2
3 Implements :data:`~muse.core.query_engine.CommitEvaluator` for the code domain.
4 Allows agents and humans to search the commit history for code changes::
5
6 muse code-query "symbol == 'my_function' and change == 'added'"
7 muse code-query "language == 'Python' and author == 'agent-x'"
8 muse code-query "agent_id == 'claude' and sem_ver_bump == 'major'"
9 muse code-query "file == 'src/core.py'"
10 muse code-query "change == 'added' and kind == 'class'"
11 muse code-query "symbol endswith _handler"
12
13 Query language
14 --------------
15
16 query = and_expr ( 'or' and_expr )*
17 and_expr = atom ( 'and' atom )*
18 atom = FIELD OP VALUE
19 FIELD = 'symbol' | 'file' | 'language' | 'kind' | 'change'
20 | 'author' | 'agent_id' | 'model_id' | 'toolchain_id'
21 | 'sem_ver_bump' | 'branch'
22 OP = '==' | '!=' | 'contains' | 'startswith' | 'endswith'
23 VALUE = QUOTED_STRING | UNQUOTED_WORD
24
25 Supported fields
26 ----------------
27
28 ``symbol`` Qualified symbol name (e.g. ``"MyClass.method"``).
29 ``file`` Workspace-relative file path.
30 ``language`` Language name (``"Python"``, ``"TypeScript"``…).
31 ``kind`` Symbol kind (``"function"``, ``"class"``, ``"method"``…).
32 ``change`` ``"added"``, ``"removed"``, or ``"modified"``.
33 ``author`` Commit author string.
34 ``agent_id`` Agent identity from commit provenance.
35 ``model_id`` Model ID from commit provenance.
36 ``toolchain_id`` Toolchain string from commit provenance.
37 ``sem_ver_bump`` Semantic version bump: ``"none"``, ``"patch"``,
38 ``"minor"``, ``"major"``.
39 ``branch`` Branch name.
40
41 Performance note
42 ----------------
43 The code evaluator reads ``commit.structured_delta`` — it never needs the
44 snapshot manifest. Callers should pass ``load_manifest=False`` to
45 :func:`~muse.core.query_engine.walk_history` to skip that I/O entirely.
46 """
47
48 import logging
49 import pathlib
50 import re
51 from dataclasses import dataclass
52 from typing import Literal, TypedDict, TypeIs, get_args
53
54 from muse.core.query_engine import CommitEvaluator, QueryMatch
55 from muse.core.commits import CommitRecord
56 from muse.domain import DomainOp, PatchOp
57 from muse.plugins.code._query import language_of
58 from muse.core.types import Manifest
59
class _SymbolMatch(TypedDict):
    """One delta record that satisfied a symbol-level predicate.

    Instances are collected while a commit's structured delta is scanned and
    are later surfaced as the ``extra`` payload of a query match.
    """

    # File part of the record's address (the text before the first "::").
    file: str
    # Symbol part of the record's address; empty for file-only addresses.
    symbol: str
    # Value of the record's "kind" entry, or "" when the record has none.
    kind: str
    # "added", "removed" or "modified", derived from the record's op type.
    change: str
    # Result of ``language_of`` for the owning op's file path.
    language: str
66
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
68
69 def _is_patch_op(op: DomainOp) -> TypeIs[PatchOp]:
70 """Narrow *op* to :class:`~muse.domain.PatchOp` so mypy can see ``child_ops``."""
71 return op["op"] == "patch"
72
73 # ---------------------------------------------------------------------------
74 # Query AST types
75 # ---------------------------------------------------------------------------
76
# Every field name a query may reference.  The first row is resolved against
# individual records of a commit's structured delta; the remaining names are
# read directly from attributes of the commit.
CodeField = Literal[
    "symbol", "file", "language", "kind", "change",
    "author", "agent_id", "model_id", "toolchain_id",
    "sem_ver_bump", "branch",
]

# Every comparison operator the query language accepts.
CodeOp = Literal["==", "!=", "contains", "startswith", "endswith"]
84
@dataclass(frozen=True)
class Comparison:
    """A single ``FIELD OP VALUE`` predicate — the leaf node of the query AST."""

    # Name of the field the predicate inspects.
    field: CodeField
    # Operator used to compare the field's value against ``value``.
    op: CodeOp
    # Right-hand side of the comparison, stored without surrounding quotes.
    value: str
92
@dataclass(frozen=True)
class AndExpr:
    """Conjunction of predicates (all must match)."""

    # Predicates joined by ``and``; the parser always supplies at least one.
    clauses: list[Comparison]
98
@dataclass(frozen=True)
class OrExpr:
    """Disjunction of AND-expressions (any must match) — the root of the query AST."""

    # Alternatives joined by ``or``; the parser always supplies at least one.
    clauses: list[AndExpr]
104
105 # ---------------------------------------------------------------------------
106 # Tokeniser & parser
107 # ---------------------------------------------------------------------------
108
# Lexer for the query DSL.  Alternatives are tried in order:
#   keyword - a reserved word; the negative lookahead stops an identifier that
#             merely begins with one (e.g. ``order``) from being split.
#   op      - the symbolic operators ``==`` and ``!=``.
#   quoted  - a single- or double-quoted string; there is no escape syntax, so
#             the string ends at the next matching quote character.
#   word    - an unquoted identifier: a letter or ``_`` followed by letters,
#             digits, ``_`` or ``.``.
# Whitespace has no alternative of its own and therefore never forms a token.
_TOKEN_RE = re.compile(
    r"""
    (?P<keyword>(?:or|and|contains|startswith|endswith)(?![A-Za-z0-9_.]))
    |(?P<op>==|!=)
    |(?P<quoted>"[^"]*"|'[^']*')
    |(?P<word>[A-Za-z_][A-Za-z0-9_.]*)
    """,
    re.VERBOSE,
)
118
# Runtime membership sets derived from the Literal aliases via ``get_args``,
# so the type-level and runtime definitions come from a single source.
_VALID_FIELDS: frozenset[str] = frozenset(get_args(CodeField))
_VALID_OPS: frozenset[str] = frozenset(get_args(CodeOp))
121
def _is_code_field(tok: str) -> TypeIs[CodeField]:
    """Return True when *tok* is in ``_VALID_FIELDS``, narrowing it to :data:`CodeField`."""
    return tok in _VALID_FIELDS
124
def _is_code_op(tok: str) -> TypeIs[CodeOp]:
    """Return True when *tok* is in ``_VALID_OPS``, narrowing it to :data:`CodeOp`."""
    return tok in _VALID_OPS
127
def _as_code_field(tok: str) -> CodeField:
    """Return *tok* typed as :data:`CodeField`, rejecting unknown names.

    Raises:
        ValueError: If *tok* is not a supported field; the message lists the
            valid names in sorted order.
    """
    if _is_code_field(tok):
        return tok
    raise ValueError(f"Unknown field: {tok!r}. Valid: {sorted(_VALID_FIELDS)}")
133
def _as_code_op(tok: str) -> CodeOp:
    """Return *tok* typed as :data:`CodeOp`, rejecting unknown operators.

    Raises:
        ValueError: If *tok* is not a supported operator; the message lists
            the valid operators in sorted order.
    """
    if _is_code_op(tok):
        return tok
    raise ValueError(f"Unknown operator: {tok!r}. Valid: {sorted(_VALID_OPS)}")
139
def _tokenize(query: str) -> list[str]:
    """Split *query* into DSL tokens, rejecting text the lexer cannot represent.

    Tokens are returned verbatim (quoted strings keep their quotes).
    Whitespace between tokens is ignored.  Any other text that matches no
    token — for example the ``/`` in an unquoted path, a ``-`` in an unquoted
    name, or a stray unterminated quote — is reported instead of being
    silently dropped, because dropping it changes the meaning of the query.

    Args:
        query: Raw query text.

    Returns:
        The tokens in source order.

    Raises:
        ValueError: If *query* contains non-whitespace text that is not part
            of any token.
    """

    def reject_unless_blank(start: int, end: int) -> None:
        # ``finditer`` skips whatever lies between matches; only whitespace
        # is allowed to be skipped.
        gap = query[start:end]
        junk = gap.strip()
        if junk:
            position = start + len(gap) - len(gap.lstrip())
            raise ValueError(
                f"Unexpected text {junk!r} at position {position} in query "
                f"{query!r}; quote values that contain characters other than "
                "letters, digits, '_' and '.'"
            )

    tokens: list[str] = []
    cursor = 0
    for match in _TOKEN_RE.finditer(query):
        reject_unless_blank(cursor, match.start())
        tokens.append(match.group())
        cursor = match.end()
    reject_unless_blank(cursor, len(query))
    return tokens
142
def _parse_query(query: str) -> OrExpr:
    """Parse a query string into an :class:`OrExpr` AST.

    Recursive-descent parser for the grammar::

        query    = and_expr ( 'or' and_expr )*
        and_expr = atom ( 'and' atom )*
        atom     = FIELD OP VALUE

    Args:
        query: Query text in the code query DSL.

    Returns:
        The parsed query; ``and`` binds tighter than ``or``.

    Raises:
        ValueError: If the query is empty or ends in the middle of a
            predicate, names an unknown field or operator, or has tokens left
            over after a complete expression.
    """
    tokens = _tokenize(query.strip())
    pos = 0

    def peek() -> str | None:
        return tokens[pos] if pos < len(tokens) else None

    def consume(expected: str) -> str:
        nonlocal pos
        # Running out of tokens is a user error, not an internal one: report
        # it as ValueError like every other parse failure.
        if pos >= len(tokens):
            raise ValueError(
                f"Incomplete query {query!r}: expected {expected} but reached the end"
            )
        tok = tokens[pos]
        pos += 1
        return tok

    def parse_atom() -> Comparison:
        validated_field = _as_code_field(consume("a field name"))
        validated_op = _as_code_op(consume("an operator"))
        val_tok = consume("a value")
        # Quoted tokens arrive with their delimiters; store the bare text.
        if val_tok.startswith(("'", '"')):
            val_tok = val_tok[1:-1]
        return Comparison(
            field=validated_field,
            op=validated_op,
            value=val_tok,
        )

    def parse_and() -> AndExpr:
        clauses: list[Comparison] = [parse_atom()]
        while peek() == "and":
            consume("'and'")
            clauses.append(parse_atom())
        return AndExpr(clauses=clauses)

    def parse_or() -> OrExpr:
        clauses: list[AndExpr] = [parse_and()]
        while peek() == "or":
            consume("'or'")
            clauses.append(parse_and())
        return OrExpr(clauses=clauses)

    parsed = parse_or()

    # Anything still unread would otherwise be ignored, silently turning a
    # mistyped query into a different (usually broader) one.
    leftover = peek()
    if leftover is not None:
        raise ValueError(
            f"Unexpected token {leftover!r} in query {query!r}: expected 'and' or 'or'"
        )
    return parsed
186
187 # ---------------------------------------------------------------------------
188 # Evaluator
189 # ---------------------------------------------------------------------------
190
191 def _match_op(actual: str, op: CodeOp, expected: str) -> bool:
192 """Apply *op* to *actual* and *expected* strings (case-insensitive where sensible)."""
193 if op == "==":
194 return actual == expected
195 if op == "!=":
196 return actual != expected
197 if op == "contains":
198 return expected.lower() in actual.lower()
199 if op == "startswith":
200 return actual.lower().startswith(expected.lower())
201 # op == "endswith"
202 return actual.lower().endswith(expected.lower())
203
def _split_address(address: str) -> tuple[str, str]:
    """Split a ``file::symbol`` address into ``(file, symbol)``.

    Only the first ``::`` separates the parts; an address without one is a
    bare file path and yields an empty symbol.
    """
    file_part, _, symbol_part = address.partition("::")
    return file_part, symbol_part


def _change_label(op_type: str) -> str:
    """Translate a delta op type into the ``change`` vocabulary of the DSL."""
    if op_type == "insert":
        return "added"
    if op_type == "delete":
        return "removed"
    # Every other op type is reported as a modification.
    return "modified"


def _commit_matches_comparison(
    comparison: Comparison,
    commit: CommitRecord,
    manifest: Manifest,
    root: pathlib.Path,
    symbol_matches: list[_SymbolMatch],
) -> bool:
    """Decide whether *commit* satisfies a single *comparison*.

    Commit-level fields are compared against the commit attribute of the same
    name.  The remaining fields (symbol, file, language, kind, change) are
    checked against every record of ``commit.structured_delta`` — each
    top-level op plus, for patch ops, its ``child_ops`` — and every record
    that satisfies the comparison is appended to *symbol_matches*.

    Args:
        comparison: The predicate to evaluate.
        commit: Commit under test.
        manifest: Unused; present only so the signature lines up with the
            :data:`~muse.core.query_engine.CommitEvaluator` protocol.
        root: Unused, for the same reason.
        symbol_matches: Output list that receives one entry per matching
            delta record.  Left untouched for commit-level fields.

    Returns:
        True if the commit attribute matched, or if at least one delta record
        matched; False otherwise (including when the commit has no delta).
    """
    name = comparison.field
    operator = comparison.op
    wanted = comparison.value

    # Commit-level fields share their name with the commit attribute, so no
    # delta traversal is needed.
    commit_level = (
        "author", "agent_id", "model_id", "toolchain_id", "sem_ver_bump", "branch",
    )
    if name in commit_level:
        return _match_op(getattr(commit, name), operator, wanted)

    delta = commit.structured_delta
    if delta is None:
        return False

    found_any = False
    for top_op in delta.get("ops", []):
        top_address: str = top_op.get("address", "")
        top_file, top_symbol = _split_address(top_address)
        language = language_of(top_file)

        # Patch ops may carry child ops for the symbols they modify; scan the
        # patch op itself first, then its children.
        nested: list[DomainOp] = top_op["child_ops"] if _is_patch_op(top_op) else []
        for rec in [top_op] + nested:
            # A record without its own address inherits the parent's.
            rec_file, rec_symbol = _split_address(str(rec.get("address", top_address)))
            values: dict[str, str] = {
                "symbol": rec_symbol or top_symbol,
                "file": rec_file or top_file,
                "language": language,
                "kind": str(rec.get("kind", "")),
                "change": _change_label(str(rec.get("op", ""))),
            }

            if not _match_op(values.get(name, ""), operator, wanted):
                continue

            found_any = True
            symbol_matches.append({
                "file": values["file"],
                "symbol": values["symbol"],
                "kind": values["kind"],
                "change": values["change"],
                "language": values["language"],
            })

    return found_any
299
def build_evaluator(query: str) -> CommitEvaluator:
    """Parse *query* and return a :data:`CommitEvaluator` for :func:`~muse.core.query_engine.walk_history`.

    The evaluator is a closure over the parsed query.  For each commit it
    tries the OR-ed clauses in order and reports on the first one that
    matches.

    Within one AND clause every predicate must hold.  Predicates on
    record-level fields (``symbol``, ``file``, ``language``, ``kind``,
    ``change``) must all hold for the *same* delta record, so
    ``symbol == 'f' and change == 'added'`` reports ``f`` only when ``f``
    itself was added — not when ``f`` was modified while some other symbol
    was added.

    Commit-level fields (``author``, ``agent_id``, etc.) produce one
    :class:`~muse.core.query_engine.QueryMatch` per matching commit;
    record-level fields produce one match per matching record (capped at 20
    per commit to prevent runaway output on large deltas).

    Args:
        query: A query string in the code query DSL.

    Returns:
        A callable that can be passed to :func:`~muse.core.query_engine.walk_history`.

    Raises:
        ValueError: If the query cannot be parsed.
    """
    ast = _parse_query(query)

    # Fields resolved per delta record rather than per commit.
    record_fields = frozenset({"symbol", "file", "language", "kind", "change"})
    max_symbol_matches = 20

    def record_key(rec: _SymbolMatch) -> tuple[str, str, str, str, str]:
        # A record's five values fully determine which predicates it
        # satisfies, so they identify it for intersection purposes.
        return (rec["file"], rec["symbol"], rec["kind"], rec["change"], rec["language"])

    def match_clause(
        and_expr: AndExpr,
        commit: CommitRecord,
        manifest: Manifest,
        root: pathlib.Path,
    ) -> list[_SymbolMatch] | None:
        """Evaluate one AND clause.

        Returns ``None`` when the clause fails, otherwise the records that
        satisfy every record-level predicate (an empty list when the clause
        contains commit-level predicates only).
        """
        survivors: list[_SymbolMatch] | None = None
        for cmp in and_expr.clauses:
            found: list[_SymbolMatch] = []
            if not _commit_matches_comparison(cmp, commit, manifest, root, found):
                return None
            if cmp.field not in record_fields:
                continue
            if survivors is None:
                survivors = found
            else:
                # Keep only records that also satisfy this predicate.
                allowed = {record_key(rec) for rec in found}
                survivors = [rec for rec in survivors if record_key(rec) in allowed]
            if not survivors:
                # Each predicate matched something, but never the same record.
                return None
        return survivors if survivors is not None else []

    def evaluator(
        commit: CommitRecord,
        manifest: Manifest,
        root: pathlib.Path,
    ) -> list[QueryMatch]:
        symbol_matches: list[_SymbolMatch] | None = None

        # First matching OR clause wins; we stop early.
        for and_expr in ast.clauses:
            symbol_matches = match_clause(and_expr, commit, manifest, root)
            if symbol_matches is not None:
                break

        if symbol_matches is None:
            return []

        matches: list[QueryMatch] = []

        if symbol_matches:
            # Record-level matches — one QueryMatch per record (capped).
            for sym in symbol_matches[:max_symbol_matches]:
                # Fall back to "?" only when both symbol and file are empty.
                detail = sym.get("symbol") or sym.get("file") or "?"
                change = sym.get("change", "")
                if change:
                    detail = f"{detail} ({change})"
                m = QueryMatch(
                    commit_id=commit.commit_id,
                    author=commit.author,
                    committed_at=commit.committed_at.isoformat(),
                    branch=commit.branch,
                    detail=detail,
                    extra=dict(sym),
                )
                if commit.agent_id:
                    m["agent_id"] = commit.agent_id
                matches.append(m)
        else:
            # Commit-level match (query touched only commit fields, or the
            # matching OR clause was a commit-level clause in a mixed query).
            m = QueryMatch(
                commit_id=commit.commit_id,
                author=commit.author,
                committed_at=commit.committed_at.isoformat(),
                branch=commit.branch,
                detail=commit.message[:80],
                extra={},
            )
            if commit.agent_id:
                m["agent_id"] = commit.agent_id
            if commit.model_id:
                m["model_id"] = commit.model_id
            matches.append(m)

        return matches

    return evaluator
File History 1 commit
sha256:06dba78c2a78e251b580422dd1fd547f3c8357ff18f7709a860873b2d24dbbbf chore: bump version to 0.2.0rc14 Sonnet 4.6 patch 1 day ago