_predicate.py
python
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2
fix: remove commit_exists filter from have anchors — server…
Sonnet 4.6
patch
21 days ago
| 1 | """Predicate DSL parser for ``muse query`` and ``muse query-history``. |
| 2 | |
| 3 | Grammar (v2) |
| 4 | ============ |
| 5 | |
| 6 | .. code-block:: text |
| 7 | |
| 8 | expr = or_expr |
| 9 | or_expr = and_expr ( "OR" and_expr )* |
| 10 | and_expr = not_expr ( and_expr )* # implicit AND / explicit AND |
| 11 | not_expr = "NOT" primary | primary |
| 12 | primary = "(" expr ")" | atom |
| 13 | atom = KEY OP VALUE |
| 14 | |
| 15 | KEY = [a-zA-Z_][a-zA-Z_0-9]* |
| 16 | OP = "~=" | "^=" | "$=" | ">=" | "<=" | "!=" | "=" |
| 17 | VALUE = double-quoted-string | bare-word |
| 18 | |
| 19 | Supported keys |
| 20 | -------------- |
| 21 | |
| 22 | Snapshot-local (available with or without ``--all-commits``): |
| 23 | |
| 24 | kind function | class | method | variable | import | … |
| 25 | language Python | Go | Rust | … |
| 26 | name bare symbol name |
| 27 | qualified_name dotted name (e.g. User.save) |
| 28 | file file path |
| 29 | hash content_id prefix (exact-body match) |
| 30 | body_hash body_hash prefix |
| 31 | signature_id signature_id prefix |
| 32 | lineno_gt symbol starts after line N (integer) |
| 33 | lineno_lt symbol starts before line N (integer) |
| 34 | |
| 35 | Example queries |
| 36 | --------------- |
| 37 | |
| 38 | kind=function language=Python name~=validate |
| 39 | (kind=function OR kind=method) name^=_ |
| 40 | NOT kind=import file~=billing |
| 41 | kind=class name~=Service language=Python |
| 42 | lineno_gt=100 lineno_lt=200 file=src/billing.py |
| 43 | |
| 44 | The parser is a hand-written recursive descent parser — no external |
| 45 | dependencies, no regex-based hacks. All parsing errors raise |
| 46 | ``PredicateError`` with a human-readable message including the position |
| 47 | in the input string. |
| 48 | |
| 49 | New keys (v2.1) |
| 50 | --------------- |
| 51 | |
| 52 | size_gt symbol body exceeds N lines (end_lineno - lineno > N) |
| 53 | size_lt symbol body shorter than N lines (end_lineno - lineno < N) |
| 54 | |
| 55 | Example:: |
| 56 | |
| 57 | kind=function size_gt=50 # oversized functions (refactor targets) |
| 58 | kind=function size_lt=3 NOT name^=_ # suspiciously tiny public functions |
| 59 | |
| 60 | Security note |
| 61 | ------------- |
| 62 | Expression nesting is capped at ``_MAX_DEPTH`` (64) levels to prevent |
| 63 | stack overflow from crafted predicate strings. |
| 64 | """ |
| 65 | |
| 66 | import re |
| 67 | import logging |
| 68 | from collections.abc import Callable |
| 69 | |
| 70 | from muse.plugins.code._query import language_of |
| 71 | from muse.plugins.code.ast_parser import SymbolRecord |
| 72 | |
| 73 | logger = logging.getLogger(__name__) |
| 74 | |
| 75 | # Maximum parenthesis nesting depth — prevents stack overflow from crafted input. |
| 76 | _MAX_DEPTH = 64 |
| 77 | |
| 78 | # Signature: (file_path: str, rec: SymbolRecord) -> bool |
| 79 | Predicate = Callable[[str, SymbolRecord], bool] |
| 80 | |
| 81 | class PredicateError(ValueError): |
| 82 | """Raised when a predicate string cannot be parsed or evaluated.""" |
| 83 | |
| 84 | # --------------------------------------------------------------------------- |
| 85 | # Tokeniser |
| 86 | # --------------------------------------------------------------------------- |
| 87 | |
| 88 | _TOKEN_SPEC = [ |
| 89 | ("LPAREN", r"\("), |
| 90 | ("RPAREN", r"\)"), |
| 91 | ("OR", r"\bOR\b"), |
| 92 | ("NOT", r"\bNOT\b"), |
| 93 | ("AND", r"\bAND\b"), |
| 94 | ("ATOM", r'[a-zA-Z_][a-zA-Z_0-9]*(?:~=|\^=|\$=|>=|<=|!=|=)"[^"]*"'), |
| 95 | ("ATOM", r'[a-zA-Z_][a-zA-Z_0-9]*(?:~=|\^=|\$=|>=|<=|!=|=)[^\s()]+'), |
| 96 | ("WS", r"\s+"), |
| 97 | ] |
| 98 | |
| 99 | _TOKEN_RE = re.compile("|".join(f"(?P<{name}_{i}>{pat})" for i, (name, pat) in enumerate(_TOKEN_SPEC))) |
| 100 | |
| 101 | class _Token: |
| 102 | __slots__ = ("kind", "value", "pos") |
| 103 | |
| 104 | def __init__(self, kind: str, value: str, pos: int) -> None: |
| 105 | self.kind = kind |
| 106 | self.value = value |
| 107 | self.pos = pos |
| 108 | |
| 109 | def __repr__(self) -> str: |
| 110 | return f"Token({self.kind!r}, {self.value!r})" |
| 111 | |
| 112 | def _tokenise(text: str) -> list[_Token]: |
| 113 | tokens: list[_Token] = [] |
| 114 | pos = 0 |
| 115 | while pos < len(text): |
| 116 | m = _TOKEN_RE.match(text, pos) |
| 117 | if m is None: |
| 118 | raise PredicateError( |
| 119 | f"Unexpected character at position {pos}: {text[pos]!r}" |
| 120 | ) |
| 121 | kind_raw = m.lastgroup or "" |
| 122 | # Strip the numeric suffix we added. |
| 123 | kind = kind_raw.rsplit("_", 1)[0] |
| 124 | if kind != "WS": |
| 125 | tokens.append(_Token(kind, m.group(), pos)) |
| 126 | pos = m.end() |
| 127 | return tokens |
| 128 | |
| 129 | # --------------------------------------------------------------------------- |
| 130 | # Atom parser |
| 131 | # --------------------------------------------------------------------------- |
| 132 | |
| 133 | _OP_RE = re.compile(r"(~=|\^=|\$=|>=|<=|!=|=)") |
| 134 | _VALID_KEYS = frozenset({ |
| 135 | "kind", "language", "name", "qualified_name", "file", |
| 136 | "hash", "body_hash", "signature_id", |
| 137 | "lineno_gt", "lineno_lt", |
| 138 | "size_gt", "size_lt", |
| 139 | }) |
| 140 | |
| 141 | def _parse_int_value(key: str, value: str) -> int: |
| 142 | """Parse *value* as an integer, raising ``PredicateError`` on failure.""" |
| 143 | try: |
| 144 | return int(value) |
| 145 | except ValueError: |
| 146 | raise PredicateError(f"{key} requires an integer value (got '{value}').") |
| 147 | |
| 148 | def _parse_atom(atom_str: str) -> Predicate: |
| 149 | """Parse a single ``key OP value`` atom into a predicate callable.""" |
| 150 | m = _OP_RE.search(atom_str) |
| 151 | if m is None: |
| 152 | raise PredicateError( |
| 153 | f"Cannot parse predicate '{atom_str}'. " |
| 154 | "Expected: key=value, key~=value, key^=value, key$=value, key!=value." |
| 155 | ) |
| 156 | op = m.group(1) |
| 157 | key = atom_str[:m.start()].strip() |
| 158 | value = atom_str[m.end():] |
| 159 | # Strip surrounding double quotes. |
| 160 | if value.startswith('"') and value.endswith('"'): |
| 161 | value = value[1:-1] |
| 162 | if key not in _VALID_KEYS: |
| 163 | raise PredicateError( |
| 164 | f"Unknown predicate key '{key}'. " |
| 165 | f"Valid keys: {', '.join(sorted(_VALID_KEYS))}." |
| 166 | ) |
| 167 | |
| 168 | value_lower = value.lower() |
| 169 | |
| 170 | # ── String-match keys ──────────────────────────────────────────────────── |
| 171 | |
| 172 | def _str_match(field: str) -> bool: |
| 173 | f = field.lower() |
| 174 | if op == "=": |
| 175 | return f == value_lower |
| 176 | if op == "~=": |
| 177 | return value_lower in f |
| 178 | if op == "^=": |
| 179 | return f.startswith(value_lower) |
| 180 | if op == "$=": |
| 181 | return f.endswith(value_lower) |
| 182 | if op == "!=": |
| 183 | return f != value_lower |
| 184 | return False |
| 185 | |
| 186 | if key == "kind": |
| 187 | def _kind(file_path: str, rec: SymbolRecord) -> bool: |
| 188 | return _str_match(rec["kind"]) |
| 189 | return _kind |
| 190 | |
| 191 | if key == "language": |
| 192 | def _language(file_path: str, rec: SymbolRecord) -> bool: |
| 193 | return _str_match(language_of(file_path)) |
| 194 | return _language |
| 195 | |
| 196 | if key == "name": |
| 197 | def _name(file_path: str, rec: SymbolRecord) -> bool: |
| 198 | return _str_match(rec["name"]) |
| 199 | return _name |
| 200 | |
| 201 | if key == "qualified_name": |
| 202 | def _qname(file_path: str, rec: SymbolRecord) -> bool: |
| 203 | return _str_match(rec["qualified_name"]) |
| 204 | return _qname |
| 205 | |
| 206 | if key == "file": |
| 207 | def _file(file_path: str, rec: SymbolRecord) -> bool: |
| 208 | return _str_match(file_path) |
| 209 | return _file |
| 210 | |
| 211 | if key == "hash": |
| 212 | prefix = value.lower() |
| 213 | def _hash(file_path: str, rec: SymbolRecord, _p: str = prefix) -> bool: |
| 214 | return rec["content_id"].startswith(_p) |
| 215 | return _hash |
| 216 | |
| 217 | if key == "body_hash": |
| 218 | prefix = value.lower() |
| 219 | def _body_hash(file_path: str, rec: SymbolRecord, _p: str = prefix) -> bool: |
| 220 | return rec["body_hash"].startswith(_p) |
| 221 | return _body_hash |
| 222 | |
| 223 | if key == "signature_id": |
| 224 | prefix = value.lower() |
| 225 | def _sig_id(file_path: str, rec: SymbolRecord, _p: str = prefix) -> bool: |
| 226 | return rec["signature_id"].startswith(_p) |
| 227 | return _sig_id |
| 228 | |
| 229 | # ── Integer-range keys ─────────────────────────────────────────────────── |
| 230 | |
| 231 | if key == "lineno_gt": |
| 232 | threshold = _parse_int_value("lineno_gt", value) |
| 233 | def _lineno_gt(file_path: str, rec: SymbolRecord, _t: int = threshold) -> bool: |
| 234 | return rec["lineno"] > _t |
| 235 | return _lineno_gt |
| 236 | |
| 237 | if key == "lineno_lt": |
| 238 | threshold = _parse_int_value("lineno_lt", value) |
| 239 | def _lineno_lt(file_path: str, rec: SymbolRecord, _t: int = threshold) -> bool: |
| 240 | return rec["lineno"] < _t |
| 241 | return _lineno_lt |
| 242 | |
| 243 | if key == "size_gt": |
| 244 | threshold = _parse_int_value("size_gt", value) |
| 245 | def _size_gt(file_path: str, rec: SymbolRecord, _t: int = threshold) -> bool: |
| 246 | return (rec["end_lineno"] - rec["lineno"]) > _t |
| 247 | return _size_gt |
| 248 | |
| 249 | if key == "size_lt": |
| 250 | threshold = _parse_int_value("size_lt", value) |
| 251 | def _size_lt(file_path: str, rec: SymbolRecord, _t: int = threshold) -> bool: |
| 252 | return (rec["end_lineno"] - rec["lineno"]) < _t |
| 253 | return _size_lt |
| 254 | |
| 255 | # Should be unreachable — all valid keys handled above. |
| 256 | raise PredicateError(f"Internal error: unhandled key '{key}'.") |
| 257 | |
| 258 | # --------------------------------------------------------------------------- |
| 259 | # Recursive descent parser |
| 260 | # --------------------------------------------------------------------------- |
| 261 | |
| 262 | class _Parser: |
| 263 | """Recursive descent parser for the predicate grammar.""" |
| 264 | |
| 265 | def __init__(self, tokens: list[_Token]) -> None: |
| 266 | self._tokens = tokens |
| 267 | self._pos = 0 |
| 268 | self._depth = 0 # parenthesis nesting depth |
| 269 | |
| 270 | def _peek(self) -> _Token | None: |
| 271 | if self._pos < len(self._tokens): |
| 272 | return self._tokens[self._pos] |
| 273 | return None |
| 274 | |
| 275 | def _consume(self, kind: str | None = None) -> _Token: |
| 276 | tok = self._peek() |
| 277 | if tok is None: |
| 278 | raise PredicateError("Unexpected end of predicate expression.") |
| 279 | if kind is not None and tok.kind != kind: |
| 280 | raise PredicateError( |
| 281 | f"Expected {kind!r} at position {tok.pos}, got {tok.kind!r} ({tok.value!r})." |
| 282 | ) |
| 283 | self._pos += 1 |
| 284 | return tok |
| 285 | |
| 286 | def parse(self) -> Predicate: |
| 287 | pred = self._parse_or() |
| 288 | if self._peek() is not None: |
| 289 | tok = self._peek() |
| 290 | assert tok is not None |
| 291 | raise PredicateError( |
| 292 | f"Unexpected token at position {tok.pos}: {tok.value!r}" |
| 293 | ) |
| 294 | return pred |
| 295 | |
| 296 | def _parse_or(self) -> Predicate: |
| 297 | left = self._parse_and() |
| 298 | peek = self._peek() |
| 299 | while peek is not None and peek.kind == "OR": |
| 300 | self._consume("OR") |
| 301 | right = self._parse_and() |
| 302 | left_cap = left |
| 303 | right_cap = right |
| 304 | def _or(fp: str, rec: SymbolRecord, _l: Predicate = left_cap, _r: Predicate = right_cap) -> bool: |
| 305 | return _l(fp, rec) or _r(fp, rec) |
| 306 | left = _or |
| 307 | peek = self._peek() |
| 308 | return left |
| 309 | |
| 310 | def _parse_and(self) -> Predicate: |
| 311 | left = self._parse_not() |
| 312 | while True: |
| 313 | tok = self._peek() |
| 314 | # Continue AND-chaining: next token is ATOM, LPAREN, or explicit AND. |
| 315 | if tok is None: |
| 316 | break |
| 317 | if tok.kind in ("RPAREN", "OR"): |
| 318 | break |
| 319 | if tok.kind == "AND": |
| 320 | self._consume("AND") |
| 321 | right = self._parse_not() |
| 322 | left_cap = left |
| 323 | right_cap = right |
| 324 | def _and(fp: str, rec: SymbolRecord, _l: Predicate = left_cap, _r: Predicate = right_cap) -> bool: |
| 325 | return _l(fp, rec) and _r(fp, rec) |
| 326 | left = _and |
| 327 | return left |
| 328 | |
| 329 | def _parse_not(self) -> Predicate: |
| 330 | peek = self._peek() |
| 331 | if peek is not None and peek.kind == "NOT": |
| 332 | self._consume("NOT") |
| 333 | inner = self._parse_primary() |
| 334 | inner_cap = inner |
| 335 | def _not(fp: str, rec: SymbolRecord, _i: Predicate = inner_cap) -> bool: |
| 336 | return not _i(fp, rec) |
| 337 | return _not |
| 338 | return self._parse_primary() |
| 339 | |
| 340 | def _parse_primary(self) -> Predicate: |
| 341 | tok = self._peek() |
| 342 | if tok is None: |
| 343 | raise PredicateError("Unexpected end of expression — expected predicate or '('.") |
| 344 | if tok.kind == "LPAREN": |
| 345 | self._depth += 1 |
| 346 | if self._depth > _MAX_DEPTH: |
| 347 | raise PredicateError( |
| 348 | f"Predicate expression nested too deeply" |
| 349 | f" (max {_MAX_DEPTH} levels)." |
| 350 | ) |
| 351 | self._consume("LPAREN") |
| 352 | inner = self._parse_or() |
| 353 | self._consume("RPAREN") |
| 354 | self._depth -= 1 |
| 355 | return inner |
| 356 | if tok.kind == "ATOM": |
| 357 | self._consume("ATOM") |
| 358 | return _parse_atom(tok.value) |
| 359 | raise PredicateError( |
| 360 | f"Unexpected token at position {tok.pos}: {tok.value!r}. " |
| 361 | "Expected a predicate (key=value) or '('." |
| 362 | ) |
| 363 | |
| 364 | # --------------------------------------------------------------------------- |
| 365 | # Public API |
| 366 | # --------------------------------------------------------------------------- |
| 367 | |
| 368 | def parse_query(tokens_or_str: str | list[str]) -> Predicate: |
| 369 | """Parse a predicate query expression into a single callable. |
| 370 | |
| 371 | Args: |
| 372 | tokens_or_str: Either a single query string (which may contain OR/NOT/ |
| 373 | parentheses) or a list of strings that are AND'd together |
| 374 | (legacy multi-argument style). |
| 375 | |
| 376 | Returns: |
| 377 | A ``Predicate`` callable ``(file_path, SymbolRecord) -> bool``. |
| 378 | |
| 379 | Raises: |
| 380 | PredicateError: If the query cannot be parsed. |
| 381 | """ |
| 382 | if isinstance(tokens_or_str, list): |
| 383 | # Legacy: list of atoms → implicit AND of all. |
| 384 | if not tokens_or_str: |
| 385 | # Match everything. |
| 386 | return lambda _fp, _rec: True |
| 387 | combined = " ".join(tokens_or_str) |
| 388 | else: |
| 389 | combined = tokens_or_str |
| 390 | |
| 391 | combined = combined.strip() |
| 392 | if not combined: |
| 393 | return lambda _fp, _rec: True |
| 394 | |
| 395 | tokens = _tokenise(combined) |
| 396 | if not tokens: |
| 397 | return lambda _fp, _rec: True |
| 398 | |
| 399 | return _Parser(tokens).parse() |
File History
4 commits
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2
fix: remove commit_exists filter from have anchors — server…
Sonnet 4.6
patch
21 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e
fix: rename objects→blobs in push client and all stale test…
Sonnet 4.6
patch
23 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a
fix: repair four test failures from post-migration audit
Sonnet 4.6
patch
29 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf
fix: unified object store migration — idempotent writes, JS…
Sonnet 4.6
minor
⚠
29 days ago