gabriel / muse public
_predicate.py python
399 lines 13.4 KB
Raw
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
1 """Predicate DSL parser for ``muse query`` and ``muse query-history``.
2
3 Grammar (v2)
4 ============
5
6 .. code-block:: text
7
8 expr = or_expr
9 or_expr = and_expr ( "OR" and_expr )*
10 and_expr = not_expr ( and_expr )* # implicit AND / explicit AND
11 not_expr = "NOT" primary | primary
12 primary = "(" expr ")" | atom
13 atom = KEY OP VALUE
14
15 KEY = [a-zA-Z_][a-zA-Z_0-9]*
16 OP = "~=" | "^=" | "$=" | ">=" | "<=" | "!=" | "="
17 VALUE = double-quoted-string | bare-word
18
19 Supported keys
20 --------------
21
22 Snapshot-local (available with or without ``--all-commits``):
23
24 kind function | class | method | variable | import | …
25 language Python | Go | Rust | …
26 name bare symbol name
27 qualified_name dotted name (e.g. User.save)
28 file file path
29 hash content_id prefix (exact-body match)
30 body_hash body_hash prefix
31 signature_id signature_id prefix
32 lineno_gt symbol starts after line N (integer)
33 lineno_lt symbol starts before line N (integer)
34
35 Example queries
36 ---------------
37
38 kind=function language=Python name~=validate
39 (kind=function OR kind=method) name^=_
40 NOT kind=import file~=billing
41 kind=class name~=Service language=Python
42 lineno_gt=100 lineno_lt=200 file=src/billing.py
43
44 The parser is a hand-written recursive descent parser — no external
45 dependencies, no regex-based hacks. All parsing errors raise
46 ``PredicateError`` with a human-readable message including the position
47 in the input string.
48
49 New keys (v2.1)
50 ---------------
51
52 size_gt symbol body exceeds N lines (end_lineno - lineno > N)
53 size_lt symbol body shorter than N lines (end_lineno - lineno < N)
54
55 Example::
56
57 kind=function size_gt=50 # oversized functions (refactor targets)
58 kind=function size_lt=3 NOT name^=_ # suspiciously tiny public functions
59
60 Security note
61 -------------
62 Expression nesting is capped at ``_MAX_DEPTH`` (64) levels to prevent
63 stack overflow from crafted predicate strings.
64 """
65
66 import re
67 import logging
68 from collections.abc import Callable
69
70 from muse.plugins.code._query import language_of
71 from muse.plugins.code.ast_parser import SymbolRecord
72
73 logger = logging.getLogger(__name__)
74
75 # Maximum parenthesis nesting depth — prevents stack overflow from crafted input.
76 _MAX_DEPTH = 64
77
78 # Signature: (file_path: str, rec: SymbolRecord) -> bool
79 Predicate = Callable[[str, SymbolRecord], bool]
80
81 class PredicateError(ValueError):
82 """Raised when a predicate string cannot be parsed or evaluated."""
83
84 # ---------------------------------------------------------------------------
85 # Tokeniser
86 # ---------------------------------------------------------------------------
87
88 _TOKEN_SPEC = [
89 ("LPAREN", r"\("),
90 ("RPAREN", r"\)"),
91 ("OR", r"\bOR\b"),
92 ("NOT", r"\bNOT\b"),
93 ("AND", r"\bAND\b"),
94 ("ATOM", r'[a-zA-Z_][a-zA-Z_0-9]*(?:~=|\^=|\$=|>=|<=|!=|=)"[^"]*"'),
95 ("ATOM", r'[a-zA-Z_][a-zA-Z_0-9]*(?:~=|\^=|\$=|>=|<=|!=|=)[^\s()]+'),
96 ("WS", r"\s+"),
97 ]
98
99 _TOKEN_RE = re.compile("|".join(f"(?P<{name}_{i}>{pat})" for i, (name, pat) in enumerate(_TOKEN_SPEC)))
100
101 class _Token:
102 __slots__ = ("kind", "value", "pos")
103
104 def __init__(self, kind: str, value: str, pos: int) -> None:
105 self.kind = kind
106 self.value = value
107 self.pos = pos
108
109 def __repr__(self) -> str:
110 return f"Token({self.kind!r}, {self.value!r})"
111
112 def _tokenise(text: str) -> list[_Token]:
113 tokens: list[_Token] = []
114 pos = 0
115 while pos < len(text):
116 m = _TOKEN_RE.match(text, pos)
117 if m is None:
118 raise PredicateError(
119 f"Unexpected character at position {pos}: {text[pos]!r}"
120 )
121 kind_raw = m.lastgroup or ""
122 # Strip the numeric suffix we added.
123 kind = kind_raw.rsplit("_", 1)[0]
124 if kind != "WS":
125 tokens.append(_Token(kind, m.group(), pos))
126 pos = m.end()
127 return tokens
128
129 # ---------------------------------------------------------------------------
130 # Atom parser
131 # ---------------------------------------------------------------------------
132
133 _OP_RE = re.compile(r"(~=|\^=|\$=|>=|<=|!=|=)")
134 _VALID_KEYS = frozenset({
135 "kind", "language", "name", "qualified_name", "file",
136 "hash", "body_hash", "signature_id",
137 "lineno_gt", "lineno_lt",
138 "size_gt", "size_lt",
139 })
140
141 def _parse_int_value(key: str, value: str) -> int:
142 """Parse *value* as an integer, raising ``PredicateError`` on failure."""
143 try:
144 return int(value)
145 except ValueError:
146 raise PredicateError(f"{key} requires an integer value (got '{value}').")
147
148 def _parse_atom(atom_str: str) -> Predicate:
149 """Parse a single ``key OP value`` atom into a predicate callable."""
150 m = _OP_RE.search(atom_str)
151 if m is None:
152 raise PredicateError(
153 f"Cannot parse predicate '{atom_str}'. "
154 "Expected: key=value, key~=value, key^=value, key$=value, key!=value."
155 )
156 op = m.group(1)
157 key = atom_str[:m.start()].strip()
158 value = atom_str[m.end():]
159 # Strip surrounding double quotes.
160 if value.startswith('"') and value.endswith('"'):
161 value = value[1:-1]
162 if key not in _VALID_KEYS:
163 raise PredicateError(
164 f"Unknown predicate key '{key}'. "
165 f"Valid keys: {', '.join(sorted(_VALID_KEYS))}."
166 )
167
168 value_lower = value.lower()
169
170 # ── String-match keys ────────────────────────────────────────────────────
171
172 def _str_match(field: str) -> bool:
173 f = field.lower()
174 if op == "=":
175 return f == value_lower
176 if op == "~=":
177 return value_lower in f
178 if op == "^=":
179 return f.startswith(value_lower)
180 if op == "$=":
181 return f.endswith(value_lower)
182 if op == "!=":
183 return f != value_lower
184 return False
185
186 if key == "kind":
187 def _kind(file_path: str, rec: SymbolRecord) -> bool:
188 return _str_match(rec["kind"])
189 return _kind
190
191 if key == "language":
192 def _language(file_path: str, rec: SymbolRecord) -> bool:
193 return _str_match(language_of(file_path))
194 return _language
195
196 if key == "name":
197 def _name(file_path: str, rec: SymbolRecord) -> bool:
198 return _str_match(rec["name"])
199 return _name
200
201 if key == "qualified_name":
202 def _qname(file_path: str, rec: SymbolRecord) -> bool:
203 return _str_match(rec["qualified_name"])
204 return _qname
205
206 if key == "file":
207 def _file(file_path: str, rec: SymbolRecord) -> bool:
208 return _str_match(file_path)
209 return _file
210
211 if key == "hash":
212 prefix = value.lower()
213 def _hash(file_path: str, rec: SymbolRecord, _p: str = prefix) -> bool:
214 return rec["content_id"].startswith(_p)
215 return _hash
216
217 if key == "body_hash":
218 prefix = value.lower()
219 def _body_hash(file_path: str, rec: SymbolRecord, _p: str = prefix) -> bool:
220 return rec["body_hash"].startswith(_p)
221 return _body_hash
222
223 if key == "signature_id":
224 prefix = value.lower()
225 def _sig_id(file_path: str, rec: SymbolRecord, _p: str = prefix) -> bool:
226 return rec["signature_id"].startswith(_p)
227 return _sig_id
228
229 # ── Integer-range keys ───────────────────────────────────────────────────
230
231 if key == "lineno_gt":
232 threshold = _parse_int_value("lineno_gt", value)
233 def _lineno_gt(file_path: str, rec: SymbolRecord, _t: int = threshold) -> bool:
234 return rec["lineno"] > _t
235 return _lineno_gt
236
237 if key == "lineno_lt":
238 threshold = _parse_int_value("lineno_lt", value)
239 def _lineno_lt(file_path: str, rec: SymbolRecord, _t: int = threshold) -> bool:
240 return rec["lineno"] < _t
241 return _lineno_lt
242
243 if key == "size_gt":
244 threshold = _parse_int_value("size_gt", value)
245 def _size_gt(file_path: str, rec: SymbolRecord, _t: int = threshold) -> bool:
246 return (rec["end_lineno"] - rec["lineno"]) > _t
247 return _size_gt
248
249 if key == "size_lt":
250 threshold = _parse_int_value("size_lt", value)
251 def _size_lt(file_path: str, rec: SymbolRecord, _t: int = threshold) -> bool:
252 return (rec["end_lineno"] - rec["lineno"]) < _t
253 return _size_lt
254
255 # Should be unreachable — all valid keys handled above.
256 raise PredicateError(f"Internal error: unhandled key '{key}'.")
257
258 # ---------------------------------------------------------------------------
259 # Recursive descent parser
260 # ---------------------------------------------------------------------------
261
262 class _Parser:
263 """Recursive descent parser for the predicate grammar."""
264
265 def __init__(self, tokens: list[_Token]) -> None:
266 self._tokens = tokens
267 self._pos = 0
268 self._depth = 0 # parenthesis nesting depth
269
270 def _peek(self) -> _Token | None:
271 if self._pos < len(self._tokens):
272 return self._tokens[self._pos]
273 return None
274
275 def _consume(self, kind: str | None = None) -> _Token:
276 tok = self._peek()
277 if tok is None:
278 raise PredicateError("Unexpected end of predicate expression.")
279 if kind is not None and tok.kind != kind:
280 raise PredicateError(
281 f"Expected {kind!r} at position {tok.pos}, got {tok.kind!r} ({tok.value!r})."
282 )
283 self._pos += 1
284 return tok
285
286 def parse(self) -> Predicate:
287 pred = self._parse_or()
288 if self._peek() is not None:
289 tok = self._peek()
290 assert tok is not None
291 raise PredicateError(
292 f"Unexpected token at position {tok.pos}: {tok.value!r}"
293 )
294 return pred
295
296 def _parse_or(self) -> Predicate:
297 left = self._parse_and()
298 peek = self._peek()
299 while peek is not None and peek.kind == "OR":
300 self._consume("OR")
301 right = self._parse_and()
302 left_cap = left
303 right_cap = right
304 def _or(fp: str, rec: SymbolRecord, _l: Predicate = left_cap, _r: Predicate = right_cap) -> bool:
305 return _l(fp, rec) or _r(fp, rec)
306 left = _or
307 peek = self._peek()
308 return left
309
310 def _parse_and(self) -> Predicate:
311 left = self._parse_not()
312 while True:
313 tok = self._peek()
314 # Continue AND-chaining: next token is ATOM, LPAREN, or explicit AND.
315 if tok is None:
316 break
317 if tok.kind in ("RPAREN", "OR"):
318 break
319 if tok.kind == "AND":
320 self._consume("AND")
321 right = self._parse_not()
322 left_cap = left
323 right_cap = right
324 def _and(fp: str, rec: SymbolRecord, _l: Predicate = left_cap, _r: Predicate = right_cap) -> bool:
325 return _l(fp, rec) and _r(fp, rec)
326 left = _and
327 return left
328
329 def _parse_not(self) -> Predicate:
330 peek = self._peek()
331 if peek is not None and peek.kind == "NOT":
332 self._consume("NOT")
333 inner = self._parse_primary()
334 inner_cap = inner
335 def _not(fp: str, rec: SymbolRecord, _i: Predicate = inner_cap) -> bool:
336 return not _i(fp, rec)
337 return _not
338 return self._parse_primary()
339
340 def _parse_primary(self) -> Predicate:
341 tok = self._peek()
342 if tok is None:
343 raise PredicateError("Unexpected end of expression — expected predicate or '('.")
344 if tok.kind == "LPAREN":
345 self._depth += 1
346 if self._depth > _MAX_DEPTH:
347 raise PredicateError(
348 f"Predicate expression nested too deeply"
349 f" (max {_MAX_DEPTH} levels)."
350 )
351 self._consume("LPAREN")
352 inner = self._parse_or()
353 self._consume("RPAREN")
354 self._depth -= 1
355 return inner
356 if tok.kind == "ATOM":
357 self._consume("ATOM")
358 return _parse_atom(tok.value)
359 raise PredicateError(
360 f"Unexpected token at position {tok.pos}: {tok.value!r}. "
361 "Expected a predicate (key=value) or '('."
362 )
363
364 # ---------------------------------------------------------------------------
365 # Public API
366 # ---------------------------------------------------------------------------
367
368 def parse_query(tokens_or_str: str | list[str]) -> Predicate:
369 """Parse a predicate query expression into a single callable.
370
371 Args:
372 tokens_or_str: Either a single query string (which may contain OR/NOT/
373 parentheses) or a list of strings that are AND'd together
374 (legacy multi-argument style).
375
376 Returns:
377 A ``Predicate`` callable ``(file_path, SymbolRecord) -> bool``.
378
379 Raises:
380 PredicateError: If the query cannot be parsed.
381 """
382 if isinstance(tokens_or_str, list):
383 # Legacy: list of atoms → implicit AND of all.
384 if not tokens_or_str:
385 # Match everything.
386 return lambda _fp, _rec: True
387 combined = " ".join(tokens_or_str)
388 else:
389 combined = tokens_or_str
390
391 combined = combined.strip()
392 if not combined:
393 return lambda _fp, _rec: True
394
395 tokens = _tokenise(combined)
396 if not tokens:
397 return lambda _fp, _rec: True
398
399 return _Parser(tokens).parse()
File History 4 commits
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 23 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 29 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 29 days ago