_code_query.py
python
sha256:06dba78c2a78e251b580422dd1fd547f3c8357ff18f7709a860873b2d24dbbbf
chore: bump version to 0.2.0rc14
Sonnet 4.6
patch
1 day ago
| 1 | """Code-domain query evaluator for the Muse generic query engine. |
| 2 | |
| 3 | Implements :data:`~muse.core.query_engine.CommitEvaluator` for the code domain. |
| 4 | Allows agents and humans to search the commit history for code changes:: |
| 5 | |
| 6 | muse code-query "symbol == 'my_function' and change == 'added'" |
| 7 | muse code-query "language == 'Python' and author == 'agent-x'" |
| 8 | muse code-query "agent_id == 'claude' and sem_ver_bump == 'major'" |
| 9 | muse code-query "file == 'src/core.py'" |
| 10 | muse code-query "change == 'added' and kind == 'class'" |
| 11 | muse code-query "symbol endswith _handler" |
| 12 | |
| 13 | Query language |
| 14 | -------------- |
| 15 | |
| 16 | query = and_expr ( 'or' and_expr )* |
| 17 | and_expr = atom ( 'and' atom )* |
| 18 | atom = FIELD OP VALUE |
| 19 | FIELD = 'symbol' | 'file' | 'language' | 'kind' | 'change' |
| 20 | | 'author' | 'agent_id' | 'model_id' | 'toolchain_id' |
| 21 | | 'sem_ver_bump' | 'branch' |
| 22 | OP = '==' | '!=' | 'contains' | 'startswith' | 'endswith' |
| 23 | VALUE = QUOTED_STRING | UNQUOTED_WORD |
| 24 | |
| 25 | Supported fields |
| 26 | ---------------- |
| 27 | |
| 28 | ``symbol`` Qualified symbol name (e.g. ``"MyClass.method"``). |
| 29 | ``file`` Workspace-relative file path. |
| 30 | ``language`` Language name (``"Python"``, ``"TypeScript"``…). |
| 31 | ``kind`` Symbol kind (``"function"``, ``"class"``, ``"method"``…). |
| 32 | ``change`` ``"added"``, ``"removed"``, or ``"modified"``. |
| 33 | ``author`` Commit author string. |
| 34 | ``agent_id`` Agent identity from commit provenance. |
| 35 | ``model_id`` Model ID from commit provenance. |
| 36 | ``toolchain_id`` Toolchain string from commit provenance. |
| 37 | ``sem_ver_bump`` Semantic version bump: ``"none"``, ``"patch"``, |
| 38 | ``"minor"``, ``"major"``. |
| 39 | ``branch`` Branch name. |
| 40 | |
| 41 | Performance note |
| 42 | ---------------- |
| 43 | The code evaluator reads ``commit.structured_delta`` — it never needs the |
| 44 | snapshot manifest. Callers should pass ``load_manifest=False`` to |
| 45 | :func:`~muse.core.query_engine.walk_history` to skip that I/O entirely. |
| 46 | """ |
| 47 | |
| 48 | import logging |
| 49 | import pathlib |
| 50 | import re |
| 51 | from dataclasses import dataclass |
| 52 | from typing import Literal, TypedDict, TypeIs, get_args |
| 53 | |
| 54 | from muse.core.query_engine import CommitEvaluator, QueryMatch |
| 55 | from muse.core.commits import CommitRecord |
| 56 | from muse.domain import DomainOp, PatchOp |
| 57 | from muse.plugins.code._query import language_of |
| 58 | from muse.core.types import Manifest |
| 59 | |
| 60 | class _SymbolMatch(TypedDict): |
| 61 | file: str |
| 62 | symbol: str |
| 63 | kind: str |
| 64 | change: str |
| 65 | language: str |
| 66 | |
| 67 | logger = logging.getLogger(__name__) |
| 68 | |
def _is_patch_op(op: DomainOp) -> TypeIs[PatchOp]:
    """Type guard: report whether *op* is tagged as a ``"patch"`` operation.

    The ``TypeIs[PatchOp]`` return type lets a type checker treat *op* as a
    :class:`~muse.domain.PatchOp` (and therefore see ``child_ops``) wherever
    this returns ``True``.
    """
    tag = op["op"]
    return tag == "patch"
| 72 | |
| 73 | # --------------------------------------------------------------------------- |
| 74 | # Query AST types |
| 75 | # --------------------------------------------------------------------------- |
| 76 | |
# Every field name a query atom may reference.  The first five are answered
# per delta op; the remaining six are read directly off the commit record.
CodeField = Literal[
    "symbol",
    "file",
    "language",
    "kind",
    "change",
    "author",
    "agent_id",
    "model_id",
    "toolchain_id",
    "sem_ver_bump",
    "branch",
]

# Operators that may appear between a field and its value.
CodeOp = Literal[
    "==",
    "!=",
    "contains",
    "startswith",
    "endswith",
]
| 84 | |
@dataclass(frozen=True)
class Comparison:
    """One ``FIELD OP VALUE`` atom of a parsed query."""

    # Name of the field under test.
    field: CodeField
    # Operator applied between the field's actual value and ``value``.
    op: CodeOp
    # Right-hand operand of the atom, stored as plain text.
    value: str
| 92 | |
@dataclass(frozen=True)
class AndExpr:
    """A run of comparisons joined by ``and``; holds only when every one holds.

    ``frozen=True`` blocks attribute reassignment only.  The list itself stays
    mutable, and because lists are unhashable, calling ``hash()`` on an
    instance raises :exc:`TypeError`.
    """

    # Predicates in the order they were written in the query.
    clauses: list[Comparison]
| 98 | |
@dataclass(frozen=True)
class OrExpr:
    """Root of a parsed query: ``and`` groups joined by ``or``; any one suffices.

    ``frozen=True`` blocks attribute reassignment only.  The list itself stays
    mutable, and because lists are unhashable, calling ``hash()`` on an
    instance raises :exc:`TypeError`.
    """

    # Alternatives in the order they were written in the query.
    clauses: list[AndExpr]
| 104 | |
| 105 | # --------------------------------------------------------------------------- |
| 106 | # Tokeniser & parser |
| 107 | # --------------------------------------------------------------------------- |
| 108 | |
| 109 | _TOKEN_RE = re.compile( |
| 110 | r""" |
| 111 | (?P<keyword>(?:or|and|contains|startswith|endswith)(?![A-Za-z0-9_.])) |
| 112 | |(?P<op>==|!=) |
| 113 | |(?P<quoted>"[^"]*"|'[^']*') |
| 114 | |(?P<word>[A-Za-z_][A-Za-z0-9_.]*) |
| 115 | """, |
| 116 | re.VERBOSE, |
| 117 | ) |
| 118 | |
# Runtime lookup sets derived from the Literal aliases, so the accepted
# vocabulary is declared in exactly one place.
_VALID_FIELDS: frozenset[str] = frozenset(get_args(CodeField))
_VALID_OPS: frozenset[str] = frozenset(get_args(CodeOp))
| 121 | |
def _is_code_field(tok: str) -> TypeIs[CodeField]:
    """Type guard: ``True`` when *tok* is a member of ``_VALID_FIELDS``."""
    recognised = tok in _VALID_FIELDS
    return recognised
| 124 | |
def _is_code_op(tok: str) -> TypeIs[CodeOp]:
    """Type guard: ``True`` when *tok* is a member of ``_VALID_OPS``."""
    recognised = tok in _VALID_OPS
    return recognised
| 127 | |
def _as_code_field(tok: str) -> CodeField:
    """Return *tok* narrowed to :data:`CodeField`.

    Raises:
        ValueError: If *tok* is not a known field name; the message lists the
            valid names in sorted order.
    """
    if _is_code_field(tok):
        return tok
    raise ValueError(f"Unknown field: {tok!r}. Valid: {sorted(_VALID_FIELDS)}")
| 133 | |
def _as_code_op(tok: str) -> CodeOp:
    """Return *tok* narrowed to :data:`CodeOp`.

    Raises:
        ValueError: If *tok* is not a known operator; the message lists the
            valid operators in sorted order.
    """
    if _is_code_op(tok):
        return tok
    raise ValueError(f"Unknown operator: {tok!r}. Valid: {sorted(_VALID_OPS)}")
| 139 | |
def _tokenize(query: str) -> list[str]:
    """Split *query* into its token strings, in order of appearance.

    Text that matches none of the alternatives in ``_TOKEN_RE`` (whitespace,
    stray punctuation) is skipped without error.
    """
    return [hit[0] for hit in _TOKEN_RE.finditer(query)]
| 142 | |
def _parse_query(query: str) -> OrExpr:
    """Parse a query string into an :class:`OrExpr` AST.

    Recursive descent over the token list; ``and`` binds tighter than ``or``::

        query    = and_expr ( 'or' and_expr )*
        and_expr = atom ( 'and' atom )*
        atom     = FIELD OP VALUE

    Args:
        query: Query text.  Surrounding whitespace is ignored.

    Returns:
        The parsed disjunction of conjunctions.

    Raises:
        ValueError: If the query is empty, stops in the middle of an atom,
            names an unknown field or operator, or has tokens left over after
            the last complete atom.
    """
    tokens = _tokenize(query.strip())
    pos = 0

    def peek() -> str | None:
        return tokens[pos] if pos < len(tokens) else None

    def consume(expected: str) -> str:
        """Return the next token, or raise ``ValueError`` if none remain."""
        nonlocal pos
        if pos >= len(tokens):
            # Report a parse error rather than leaking a bare IndexError.
            raise ValueError(
                f"Unexpected end of query {query!r}: expected {expected}"
            )
        tok = tokens[pos]
        pos += 1
        return tok

    def parse_atom() -> Comparison:
        validated_field = _as_code_field(consume("a field name"))
        validated_op = _as_code_op(consume("an operator"))
        val_tok = consume("a value")
        if val_tok.startswith(("'", '"')):
            # Quoted tokens always carry both delimiters; drop them.
            val_tok = val_tok[1:-1]
        return Comparison(
            field=validated_field,
            op=validated_op,
            value=val_tok,
        )

    def parse_and() -> AndExpr:
        clauses: list[Comparison] = [parse_atom()]
        while peek() == "and":
            consume("'and'")
            clauses.append(parse_atom())
        return AndExpr(clauses=clauses)

    def parse_or() -> OrExpr:
        clauses: list[AndExpr] = [parse_and()]
        while peek() == "or":
            consume("'or'")
            clauses.append(parse_and())
        return OrExpr(clauses=clauses)

    ast = parse_or()
    if pos < len(tokens):
        # Leftover tokens mean part of the query would be silently ignored
        # (e.g. an unquoted value such as ``agent-x`` lexes as two words).
        raise ValueError(
            f"Unexpected token {tokens[pos]!r} in query {query!r}: "
            "expected 'and', 'or' or end of query"
        )
    return ast
| 186 | |
| 187 | # --------------------------------------------------------------------------- |
| 188 | # Evaluator |
| 189 | # --------------------------------------------------------------------------- |
| 190 | |
| 191 | def _match_op(actual: str, op: CodeOp, expected: str) -> bool: |
| 192 | """Apply *op* to *actual* and *expected* strings (case-insensitive where sensible).""" |
| 193 | if op == "==": |
| 194 | return actual == expected |
| 195 | if op == "!=": |
| 196 | return actual != expected |
| 197 | if op == "contains": |
| 198 | return expected.lower() in actual.lower() |
| 199 | if op == "startswith": |
| 200 | return actual.lower().startswith(expected.lower()) |
| 201 | # op == "endswith" |
| 202 | return actual.lower().endswith(expected.lower()) |
| 203 | |
| 204 | def _commit_matches_comparison( |
| 205 | comparison: Comparison, |
| 206 | commit: CommitRecord, |
| 207 | manifest: Manifest, |
| 208 | root: pathlib.Path, |
| 209 | symbol_matches: list[_SymbolMatch], |
| 210 | ) -> bool: |
| 211 | """Return True if *commit* + its symbols satisfy *comparison*. |
| 212 | |
| 213 | For symbol/file/language/kind/change fields, each (symbol, file) pair |
| 214 | that matches is appended to *symbol_matches* for result detail. |
| 215 | The ``manifest`` argument is accepted to satisfy the |
| 216 | :data:`~muse.core.query_engine.CommitEvaluator` protocol but is unused — |
| 217 | all relevant data lives in ``commit.structured_delta``. |
| 218 | """ |
| 219 | f = comparison.field |
| 220 | op = comparison.op |
| 221 | v = comparison.value |
| 222 | |
| 223 | # Commit-level fields — no delta traversal needed. |
| 224 | if f == "author": |
| 225 | return _match_op(commit.author, op, v) |
| 226 | if f == "agent_id": |
| 227 | return _match_op(commit.agent_id, op, v) |
| 228 | if f == "model_id": |
| 229 | return _match_op(commit.model_id, op, v) |
| 230 | if f == "toolchain_id": |
| 231 | return _match_op(commit.toolchain_id, op, v) |
| 232 | if f == "sem_ver_bump": |
| 233 | return _match_op(commit.sem_ver_bump, op, v) |
| 234 | if f == "branch": |
| 235 | return _match_op(commit.branch, op, v) |
| 236 | |
| 237 | # Symbol/file-level fields — iterate the structured delta. |
| 238 | delta = commit.structured_delta |
| 239 | if delta is None: |
| 240 | return False |
| 241 | |
| 242 | hit = False |
| 243 | for op_rec in delta.get("ops", []): |
| 244 | op_type: str = op_rec.get("op", "") |
| 245 | address: str = op_rec.get("address", "") |
| 246 | |
| 247 | if "::" in address: |
| 248 | file_path, symbol_name = address.split("::", 1) |
| 249 | else: |
| 250 | file_path = address |
| 251 | symbol_name = "" |
| 252 | |
| 253 | lang = language_of(file_path) |
| 254 | change_type = ( |
| 255 | "added" if op_type == "insert" |
| 256 | else "removed" if op_type == "delete" |
| 257 | else "modified" |
| 258 | ) |
| 259 | |
| 260 | # PatchOps may carry child_ops for the symbols they modify. |
| 261 | child_ops: list[DomainOp] = op_rec["child_ops"] if _is_patch_op(op_rec) else [] |
| 262 | all_ops: list[DomainOp] = [op_rec] + child_ops |
| 263 | |
| 264 | for rec in all_ops: |
| 265 | rec_address: str = str(rec.get("address", address)) |
| 266 | if "::" in rec_address: |
| 267 | rec_file, rec_symbol = rec_address.split("::", 1) |
| 268 | else: |
| 269 | rec_file = rec_address |
| 270 | rec_symbol = "" |
| 271 | |
| 272 | rec_kind: str = str(rec.get("kind", "")) |
| 273 | rec_op_type: str = str(rec.get("op", "")) |
| 274 | rec_change = ( |
| 275 | "added" if rec_op_type == "insert" |
| 276 | else "removed" if rec_op_type == "delete" |
| 277 | else "modified" |
| 278 | ) |
| 279 | |
| 280 | field_val = { |
| 281 | "symbol": rec_symbol or symbol_name, |
| 282 | "file": rec_file or file_path, |
| 283 | "language": lang, |
| 284 | "kind": rec_kind, |
| 285 | "change": rec_change or change_type, |
| 286 | }.get(f, "") |
| 287 | |
| 288 | if _match_op(field_val, op, v): |
| 289 | hit = True |
| 290 | symbol_matches.append({ |
| 291 | "file": rec_file or file_path, |
| 292 | "symbol": rec_symbol or symbol_name, |
| 293 | "kind": rec_kind, |
| 294 | "change": rec_change or change_type, |
| 295 | "language": lang, |
| 296 | }) |
| 297 | |
| 298 | return hit |
| 299 | |
def build_evaluator(query: str) -> CommitEvaluator:
    """Parse *query* and return a :data:`CommitEvaluator` for :func:`~muse.core.query_engine.walk_history`.

    The returned closure evaluates the ``or`` alternatives of the query in
    order and stops at the first one the commit satisfies.  Within one
    alternative every ``and``-ed predicate must hold, and all symbol-level
    predicates (``symbol``, ``file``, ``language``, ``kind``, ``change``) must
    hold for the *same* delta op — so ``symbol == 'f' and change == 'added'``
    only matches commits in which ``f`` itself was added.

    Alternatives made up solely of commit-level fields (``author``,
    ``agent_id``, etc.) produce one :class:`~muse.core.query_engine.QueryMatch`
    per matching commit.  Alternatives with symbol-level fields produce one
    match per distinct matching symbol record, capped at 20 per commit to
    prevent runaway output on large deltas.

    Args:
        query: A query string in the code query DSL.

    Returns:
        A callable that can be passed to :func:`~muse.core.query_engine.walk_history`.

    Raises:
        ValueError: If the query cannot be parsed.
    """
    ast = _parse_query(query)

    # Fields answered per delta op; these names are also the record keys.
    symbol_fields = ("symbol", "file", "language", "kind", "change")
    max_symbols_per_commit = 20

    def joint_symbols(
        and_expr: AndExpr,
        candidates: list[_SymbolMatch],
    ) -> list[_SymbolMatch]:
        """Keep the distinct candidates that satisfy every symbol-level predicate."""
        symbol_cmps = [c for c in and_expr.clauses if c.field in symbol_fields]
        kept: list[_SymbolMatch] = []
        seen: set[tuple[str, ...]] = set()
        for cand in candidates:
            values = {k: str(v) for k, v in cand.items()}
            key = tuple(values.get(name, "") for name in symbol_fields)
            if key in seen:
                # The same op is collected once per predicate it satisfies.
                continue
            if all(
                _match_op(values.get(c.field, ""), c.op, c.value)
                for c in symbol_cmps
            ):
                seen.add(key)
                kept.append(cand)
        return kept

    def evaluator(
        commit: CommitRecord,
        manifest: Manifest,
        root: pathlib.Path,
    ) -> list[QueryMatch]:
        symbol_matches: list[_SymbolMatch] = []
        or_matched = False

        # First matching OR clause wins; we stop early.
        for and_expr in ast.clauses:
            candidates: list[_SymbolMatch] = []
            if not all(
                _commit_matches_comparison(cmp, commit, manifest, root, candidates)
                for cmp in and_expr.clauses
            ):
                continue

            joint = joint_symbols(and_expr, candidates)
            if candidates and not joint:
                # Each predicate matched some op, but no single op satisfied
                # all of them, so the conjunction does not hold.
                continue

            symbol_matches = joint
            or_matched = True
            break

        if not or_matched:
            return []

        matches: list[QueryMatch] = []

        if symbol_matches:
            # Symbol-level matches — one QueryMatch per symbol (capped).
            for sym in symbol_matches[:max_symbols_per_commit]:
                detail = sym.get("symbol") or sym.get("file", "?")
                change = sym.get("change", "")
                if change:
                    detail = f"{detail} ({change})"
                m = QueryMatch(
                    commit_id=commit.commit_id,
                    author=commit.author,
                    committed_at=commit.committed_at.isoformat(),
                    branch=commit.branch,
                    detail=detail,
                    extra={k: v for k, v in sym.items()},
                )
                if commit.agent_id:
                    m["agent_id"] = commit.agent_id
                # NOTE(review): unlike the commit-level branch below, model_id
                # is not attached here — confirm whether that is intentional.
                matches.append(m)
        else:
            # Commit-level match (query touched only commit fields, or the
            # matching OR clause was a commit-level clause in a mixed query).
            m = QueryMatch(
                commit_id=commit.commit_id,
                author=commit.author,
                committed_at=commit.committed_at.isoformat(),
                branch=commit.branch,
                detail=commit.message[:80],
                extra={},
            )
            if commit.agent_id:
                m["agent_id"] = commit.agent_id
            if commit.model_id:
                m["model_id"] = commit.model_id
            matches.append(m)

        return matches

    return evaluator
File History
1 commit
sha256:06dba78c2a78e251b580422dd1fd547f3c8357ff18f7709a860873b2d24dbbbf
chore: bump version to 0.2.0rc14
Sonnet 4.6
patch
1 day ago