gabriel / muse public
typing_audit.py python
878 lines 33.3 KB
Raw
sha256:51116ec824246acde6abf729e6ba854c223dc5173eff31a645520208023b0652 refactor(bridge): comprehensive spec sweep — close all issu… Sonnet 4.6 minor ⚠ breaking 28 days ago
1 #!/usr/bin/env python3
2 """Typing audit — zero-tolerance type-safety enforcement for mission-critical code.
3
4 Every banned pattern maps to a future Rust port liability: if Python cannot
5 name a type, ``rustc`` cannot either. The ratchet keeps the rule enforced
6 continuously so violations never accumulate.
7
8 Patterns checked
9 ----------------
10 *Any-as-type* — ``dict[str, Any]``, ``list[Any]``, ``type[Any]``,
11 ``Any | X``, ``X | Any``, ``Mapping[str, Any]``, etc.
12
13 *object-as-type* — same severity as Any; erases all structural information.
14
15 *cast()* — all usage banned; it conceals a broken callee return type.
16
17 *# type: ignore* — every suppressed error is an unaudited assumption.
18
19 *Bare collections* — ``list``, ``dict``, ``set``, ``tuple`` without ``[T]``.
20
21 *Optional[X]* and *Union[X, Y]* — use ``X | None`` and ``X | Y`` (PEP 604).
22
23 *Legacy typing imports* — ``List``, ``Dict``, ``Set``, ``Tuple``.
24
25 *Bare Callable / Callable returning Any* — must carry a full signature.
26
27 *Untyped varargs* — ``*args: Any``, ``**kwargs: Any``, and unannotated
28 ``*args`` / ``**kwargs`` (annotation absent entirely).
29
30 *Untyped function definitions* — missing return or parameter annotation.
31
32 *Unconstrained TypeVar* — ``TypeVar(...)`` with no ``bound=`` and no
33 constraint arguments; behaves identically to ``Any`` in practice.
34
35 *Naked dict at boundary* — ``dict[str, X]`` as a parameter or return type
36 is banned at function/method boundaries. Every dict with known keys must
37 be a ``TypedDict``; every dict with dynamic keys must justify its key space.
38 The only valid ``dict[str, ...]`` at a boundary is an explicitly named
39 ``TypedDict`` subclass. This rule exists because ``rustc`` cannot infer
40 struct fields from a ``HashMap<String, X>`` — named fields must be declared.
41 Pattern ``boundary_dict`` fires on ``: dict[str,`` and ``-> dict[str,``.
42
43 *Anonymous dict in collection* — ``list[dict[str, X]]``, ``dict[str, dict[str, X]]``,
44 ``tuple[dict[str, X], ...]``. An anonymous dict nested inside a collection is
45 always a named struct waiting to be declared. Use a ``TypedDict`` subclass or a
46 named type alias (e.g. ``list[JSONObject]``, ``list[SymbolHistoryEntry]``).
47
48 Named type aliases do NOT trigger this rule — only the literal expansion does.
49 This is by design: ``list[JSONObject]`` is fine; ``list[dict[str, JSONValue]]`` is not.
50 Rust requires every struct field to be named; ``Vec<HashMap<String, Value>>`` is
51 never the right answer when ``Vec<SymbolEntry>`` is possible.
52
53 ``concrete_dict_in_list`` — fires on ``list[dict[str,``, ``tuple[dict[str,``,
54 ``set[dict[str,``
55 ``dict_of_dict`` — fires on ``dict[str, dict[str,``
56
57 Usage::
58
59 python tools/typing_audit.py # muse/ + tests/
60 python tools/typing_audit.py --dirs muse/ tests/
61 python tools/typing_audit.py --dirs muse/ --max-any 0 --max-untyped 0
62 python tools/typing_audit.py --json artifacts/typing_audit.json
63 """
64
65 from __future__ import annotations
66
67 import argparse
68 import ast
69 import io
70 import json
71 import operator
72 import re
73 import sys
74 import tokenize
75 from collections import defaultdict
76 from pathlib import Path
77 from typing import TypedDict
78
79 # ---------------------------------------------------------------------------
80 # Type aliases — avoid dict[str, X] at function/class-field boundaries.
81 # ---------------------------------------------------------------------------
82
# Pattern identifier -> number of matches.
type PatternCounts = dict[str, int]
# Pattern identifier -> 1-based line numbers on which the pattern matched.
type PatternLines = dict[str, list[int]]
# Pattern identifier -> compiled regular expression.
type PatternMap = dict[str, re.Pattern[str]]
# File path -> that file's PatternCounts.
type PerFileViolations = dict[str, PatternCounts]
87
88
class Violation(TypedDict):
    """A single typed violation — one pattern match at one source location.

    The report holds one record per (pattern, line) pair, even when the
    pattern matched several times on that line.
    """

    # Path of the scanned file, as a string.
    file: str
    # 1-based line number on which the pattern matched.
    line: int
    # Pattern identifier (a key of ``_PATTERNS``).
    kind: str
95
96 # ---------------------------------------------------------------------------
97 # Data shapes — TypedDicts replace every dict[str, Any] in the old script.
98 # All shapes mirror the Rust struct that will eventually own them.
99 # ---------------------------------------------------------------------------
100
101
class UntypedDef(TypedDict):
    """A function or method that is missing a required type annotation.

    ``issue`` is one of:

    - ``"missing_return_type"``  — no return annotation.
    - ``"missing_param_type"``   — a non-self/cls parameter lacks annotation.
    - ``"untyped_args"``         — ``*args`` is annotated as ``Any`` or has
      no annotation at all.
    - ``"untyped_kwargs"``       — ``**kwargs`` is annotated as ``Any`` or has
      no annotation at all.
    - ``"unconstrained_typevar"``— a ``TypeVar`` with no ``bound=`` and no
      positional constraints.
    """

    # Path of the scanned file, as a string.
    file: str
    # 1-based line of the offending definition, parameter, or assignment.
    line: int
    # Function name, ``func.param`` / ``func.*args`` / ``func.**kwargs`` for
    # parameters, or the assignment target name for a TypeVar.
    name: str
    # One of the issue identifiers listed in the class docstring.
    issue: str
121
122
class FileResult(TypedDict):
    """Typing-violation summary for a single Python source file."""

    # Path of the scanned file, as a string.
    file: str
    # True when the file imports ``Any`` from typing / typing_extensions.
    imports_any: bool
    # Match count per pattern identifier (only patterns that matched).
    patterns: PatternCounts
    # Line numbers on which each pattern matched.
    pattern_lines: PatternLines
    # Count per ``type_ignore[...]`` variant label.
    type_ignore_variants: PatternCounts
    # AST-derived records for missing annotations and unconstrained TypeVars.
    untyped_defs: list[UntypedDef]
132
133
class Offender(TypedDict):
    """A file with at least one typing violation, ranked by total count."""

    # Path of the offending file, as a string.
    file: str
    # Sum of all pattern match counts in the file.
    total: int
    # Match count per pattern identifier for this file.
    patterns: PatternCounts
140
141
class ReportSummary(TypedDict):
    """High-level aggregate counts for the entire scan."""

    # Number of files that produced a FileResult.
    total_files_scanned: int
    # Number of scanned files that import ``Any``.
    files_importing_any: int
    # Sum of ALL pattern match counts (not only the Any-related patterns,
    # despite the name); this is the value ``--max-any`` is compared against.
    total_any_patterns: int
    # Number of UntypedDef records; compared against ``--max-untyped``.
    untyped_defs: int
149
150
class Report(TypedDict):
    """Full typing-audit report produced by :func:`generate_report`."""

    # Aggregate counts for the whole scan.
    summary: ReportSummary
    # Match count per pattern identifier across all files.
    pattern_totals: PatternCounts
    # Count per ``type_ignore[...]`` variant label across all files.
    type_ignore_variants: PatternCounts
    # Every file with at least one match, sorted by descending total.
    top_offenders: list[Offender]
    # Per-file pattern counts for files with at least one match.
    per_file: PerFileViolations
    # One record per (pattern, line), sorted by file then line.
    violations: list[Violation]
    # Every UntypedDef record collected from all files.
    untyped_defs: list[UntypedDef]
161
162
163 # ---------------------------------------------------------------------------
164 # String-literal masking
165 # ---------------------------------------------------------------------------
166
167
def _mask_string_literals(source: str) -> str:
    """Replace string-literal content with spaces, preserving newlines.

    Pattern matching runs on the masked source so that raw regex strings,
    docstrings, and string constants never produce false positives.  All
    newlines are preserved so that line numbers stay accurate.

    Masking is best-effort: if the tokenizer fails part-way through (e.g. an
    incomplete snippet or inconsistent indentation), the strings tokenized
    before the failure stay masked and the remainder of the source is
    returned as-is, so the caller still gets output for the file.

    Args:
        source: Full source text of a Python file.

    Returns:
        A copy of *source*, of identical length, with the content of every
        string token replaced by space characters (newlines within
        multi-line strings preserved).
    """
    chars = list(source)

    # Cumulative start offset of every line, for O(1) (row, col) → offset.
    # The lines are taken from the same kind of stream the tokenizer reads,
    # so rows agree with token positions.  ``str.splitlines`` must NOT be
    # used here: it also splits on form feed, \x1c-\x1e, \x85, \u2028 and
    # \u2029, which the tokenizer does not treat as line ends, and every
    # such character would shift all later offsets.
    offsets: list[int] = [0]
    for ln in io.StringIO(source):
        offsets.append(offsets[-1] + len(ln))

    def _abs(row: int, col: int) -> int:
        return offsets[row - 1] + col

    # Token types that contain string literal content — including f-string
    # middle segments which are FSTRING_MIDDLE (not STRING) in Python 3.12+.
    string_types = {tokenize.STRING}
    fstring_middle = getattr(tokenize, "FSTRING_MIDDLE", None)
    if fstring_middle is not None:
        string_types.add(fstring_middle)

    limit = len(chars)
    try:
        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
            if tok.type not in string_types:
                continue
            start = _abs(*tok.start)
            end = min(_abs(*tok.end), limit)
            for i in range(start, end):
                if chars[i] not in {"\n", "\r"}:
                    chars[i] = " "
    except (tokenize.TokenError, SyntaxError):
        # SyntaxError covers IndentationError, which the tokenizer raises
        # directly (not wrapped in TokenError) on an inconsistent dedent.
        pass

    return "".join(chars)
218
219
220 # ---------------------------------------------------------------------------
221 # Pattern registry
222 # ---------------------------------------------------------------------------
223
#: All patterns that count toward the violation total.
#: Keys are stable identifiers used in JSON output and tests.
#:
#: NOTE: do NOT use re.IGNORECASE — Python type annotations are case-sensitive.
#: ``List`` and ``list`` are distinct identifiers; matching ``list[any]``
#: (where ``any`` is the built-in function) would be a false positive.
_PATTERNS: PatternMap = {
    # Any-as-type ─────────────────────────────────────────────────────────
    "dict_str_any": re.compile(r"\bdict\[str,\s*Any\]|\bDict\[str,\s*Any\]"),
    "list_any": re.compile(r"\blist\[Any\]|\bList\[Any\]"),
    "type_any": re.compile(r"\btype\[Any\]"),
    "any_in_union": re.compile(r"\bAny\s*\||\|\s*Any\b"),
    "return_any": re.compile(r"->\s*Any\b"),
    "param_any": re.compile(r":\s*Any\b"),
    "mapping_any": re.compile(r"\bMapping\[str,\s*Any\]"),
    "optional_any": re.compile(r"\bOptional\[Any\]"),
    "sequence_any": re.compile(r"\bSequence\[Any\]|\bIterable\[Any\]"),
    # ``Any`` must be a whole identifier: a bare substring match would flag
    # ``tuple[AnyStr, ...]`` and similar names that merely start with "Any".
    "tuple_any": re.compile(
        r"\btuple\[[^\n]*\bAny\b[^\n]*\]|\bTuple\[[^\n]*\bAny\b[^\n]*\]"
    ),
    # object-as-type ──────────────────────────────────────────────────────
    "param_object": re.compile(r":\s*object\b"),
    "return_object": re.compile(r"->\s*object\b"),
    # Handles one level of nesting, e.g. dict[str, list[object]].
    # NOTE: Mapping is intentionally excluded — Mapping[str, object] is the
    # correct type for read-only, covariant mappings at framework boundaries
    # (e.g. Jinja2 template contexts).  Mapping[str, Any] is caught separately
    # by mapping_any.  Only mutable collection types need this guard.
    "collection_object": re.compile(
        r"\b(?:dict|list|set|tuple|Sequence)"
        r"\[[^\n\[\]]*(?:\[[^\n\[\]]*\][^\n\[\]]*)*\bobject\b"
    ),
    # cast() — banned ─────────────────────────────────────────────────────
    # The lookbehind skips unrelated ``obj.cast(...)`` methods and names
    # that merely end in "cast"; the second alternative re-admits the
    # module-qualified ``typing.cast(...)``, which the lookbehind would
    # otherwise let through.
    "cast_usage": re.compile(r"(?<![.\w])cast\(|\btyping\.cast\("),
    # type: ignore — only flag blanket suppresses (no specific error code).
    # ``# type: ignore[some-code]`` is acceptable when the exact issue is known;
    # ``# type: ignore`` with no code is a blind suppression and always banned.
    "type_ignore": re.compile(r"#\s*type:\s*ignore(?!\s*\[)"),
    # Bare collections (no type parameters) ───────────────────────────────
    # Negative lookaheads exclude parameterised forms and prose.
    "bare_list": re.compile(r"(?::\s*|->\s*)list\b(?!\[|\(|\s+[a-z])"),
    "bare_dict": re.compile(r"(?::\s*|->\s*)dict\b(?!\[|\(|\s+[a-z])"),
    "bare_set": re.compile(r"(?::\s*|->\s*)set\b(?!\[|\(|\s+[a-z])"),
    "bare_tuple": re.compile(r"(?::\s*|->\s*)tuple\b(?!\[|\(|\s+[a-z])"),
    # Optional[X] — use X | None (PEP 604) ────────────────────────────────
    # Optional[Any] is excluded here because optional_any already counts it.
    "optional_usage": re.compile(r"\bOptional\[(?!Any\b)"),
    # Union[X, Y] — use X | Y (PEP 604) ──────────────────────────────────
    "union_usage": re.compile(r"\bUnion\["),
    # Legacy typing imports (use lowercase builtins) ──────────────────────
    "legacy_List": re.compile(r"\bList\["),
    "legacy_Dict": re.compile(r"\bDict\["),
    "legacy_Set": re.compile(r"\bSet\["),
    "legacy_Tuple": re.compile(r"\bTuple\["),
    # Callable — must carry full signature ────────────────────────────────
    "bare_callable": re.compile(r"(?::\s*|->\s*)Callable\b(?!\[)"),
    "callable_any": re.compile(r"\bCallable\[[^\n]*,\s*Any\s*\]"),
    # Untyped varargs — *args: Any / **kwargs: Any ────────────────────────
    # Unannotated *args/**kwargs are caught by the AST walker instead.
    "varargs_any": re.compile(r"\*{1,2}\w+:\s*Any\b"),
    # Naked dict at boundary — dict[str, X] as param/return type is banned.
    # Every structured boundary must use a TypedDict (or dataclass/enum).
    # Matches ": dict[str," and "-> dict[str," — the two annotation positions.
    #
    # APPROVED alternatives at boundaries:
    #   - ReadOnlyJSONObject (= Mapping[str, JSONValue]) for read-only JSON params
    #   - A named TypedDict subclass for any dict with statically known keys
    #
    # Mapping[str, JSONValue] is covariant so any dict[str, T where T ⊆ JSONValue]
    # is assignable to it.  This pattern (boundary_dict) does NOT fire on
    # Mapping[...]; mapping_any does NOT fire on Mapping[str, JSONValue].
    # Therefore Mapping[str, JSONValue] is the safe boundary form for JSON dicts.
    "boundary_dict": re.compile(r"(?::\s*|->\s*)dict\[str\s*,"),
    # Anonymous dict in collection — list[dict[str, X]] / dict[str, dict[str, X]].
    # A dict nested inside a collection is always a named struct opportunity.
    # Use a TypedDict subclass or a named type alias (e.g. list[JSONObject]).
    # Named aliases do NOT trigger this rule — only the literal expansion does.
    # This is intentional: list[JSONObject] is fine; list[dict[str, JSONValue]] is not.
    "concrete_dict_in_list": re.compile(
        r"\b(?:list|tuple|set)\[dict\[str,"
    ),
    "dict_of_dict": re.compile(
        r"\bdict\[str,\s*dict\[str,"
    ),
}
306
# Category groupings for the human-readable report, in display order.
# Each entry is (heading, pattern identifiers); the identifiers are keys of
# ``_PATTERNS``.  A pattern that is not listed in any category still counts
# toward the totals but is not shown in the per-category breakdown.
_CATEGORY_ORDER: list[tuple[str, list[str]]] = [
    ("Any-as-type", [
        "dict_str_any", "list_any", "type_any", "any_in_union",
        "return_any", "param_any",
        "mapping_any", "optional_any", "sequence_any", "tuple_any",
    ]),
    ("object-as-type", ["param_object", "return_object", "collection_object"]),
    ("cast() usage", ["cast_usage"]),
    ("type: ignore", ["type_ignore"]),
    ("Bare collections", ["bare_list", "bare_dict", "bare_set", "bare_tuple"]),
    ("Optional (use X | None)", ["optional_usage"]),
    ("Union (use X | Y)", ["union_usage"]),
    ("Legacy typing imports", ["legacy_List", "legacy_Dict", "legacy_Set", "legacy_Tuple"]),
    ("Callable (must carry full signature)", ["bare_callable", "callable_any"]),
    ("Untyped varargs", ["varargs_any"]),
    ("Naked dict at boundary (use TypedDict)", ["boundary_dict"]),
    ("Anonymous dict in collection (use TypedDict or named alias)", [
        "concrete_dict_in_list", "dict_of_dict",
    ]),
]
328
# Directories that are never source code and must be skipped during scanning.
# Matched by exact path-component name: virtual environments, bytecode and
# tool caches, VCS/tool metadata, and build/packaging output.
_SKIP_DIRS: frozenset[str] = frozenset({
    "venv", ".venv", "env", ".env",
    "__pycache__",
    ".git", ".muse", ".mypy_cache", ".ruff_cache", ".pytest_cache", ".tox",
    "dist", "build", "site-packages", "__pypackages__",
})
336
337
338 # ---------------------------------------------------------------------------
339 # Pattern helpers
340 # ---------------------------------------------------------------------------
341
342
def _count_pattern_in_line(line: str, pattern: re.Pattern[str]) -> int:
    """Count how many times *pattern* matches in *line*.

    Matches are non-overlapping and found left to right.

    Args:
        line: A single line of (masked) source text.
        pattern: Compiled regular expression to search for.

    Returns:
        The number of matches; ``0`` when there are none.
    """
    return sum(1 for _ in pattern.finditer(line))
346
347
def _imports_any(source: str) -> bool:
    """Return ``True`` if the source imports ``Any`` from ``typing`` or
    ``typing_extensions``.

    The check walks the AST, so it sees parenthesised multi-line imports
    (``from typing import (\\n    Any,\\n)``) and ignores text that only looks
    like an import: commented-out lines, docstring examples, and trailing
    comments that happen to mention ``Any``.

    When the source cannot be parsed, a line-based regex is used instead so
    that broken files are still reported on a best-effort basis.

    Args:
        source: Full source text of a Python file.

    Returns:
        ``True`` when an ``Any`` import was found, ``False`` otherwise.
    """
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError):
        # Unparseable source: fall back to the single-line textual check.
        return bool(re.search(
            r"^[ \t]*from\s+typing(?:_extensions)?\s+import\s+.*\bAny\b",
            source,
            re.MULTILINE,
        ))

    return any(
        isinstance(node, ast.ImportFrom)
        and node.level == 0  # absolute import, not ``from .typing import``
        and node.module in {"typing", "typing_extensions"}
        and any(alias.name == "Any" for alias in node.names)
        for node in ast.walk(tree)
    )
360
361
def _classify_type_ignore(line: str) -> str:
    """Label the flavour of ``# type: ignore`` comment found on *line*.

    Args:
        line: A single source line that contains ``# type: ignore``.

    Returns:
        ``"type_ignore[<codes>]"`` when the comment names one or more error
        codes in square brackets, otherwise ``"type_ignore[blanket]"``.
    """
    found = re.search(r"#\s*type:\s*ignore\[([^\]]+)\]", line)
    if found is None:
        return "type_ignore[blanket]"
    return f"type_ignore[{found.group(1)}]"
378
379
380 # ---------------------------------------------------------------------------
381 # AST-based detection
382 # ---------------------------------------------------------------------------
383
384
385 def _is_any_annotation(node: ast.expr | None) -> bool:
386 """Return ``True`` if *node* is the bare ``Any`` name."""
387 return isinstance(node, ast.Name) and node.id == "Any"
388
389
def _find_untyped_defs(source: str, filepath: str) -> list[UntypedDef]:
    """Collect every annotation gap in the functions defined in *source*.

    For each ``def`` / ``async def`` found anywhere in the module, in AST
    walk order, the following are reported:

    - no return annotation (``missing_return_type``);
    - a positional, positional-only or keyword-only parameter other than
      ``self`` / ``cls`` without an annotation (``missing_param_type``);
    - ``*args`` with no annotation or annotated ``Any`` (``untyped_args``);
    - ``**kwargs`` with no annotation or annotated ``Any``
      (``untyped_kwargs``).

    Unconstrained ``TypeVar`` assignments are appended after all function
    records.  Parameter records carry the parameter's own line number, not
    the line of the ``def``, so the report points at the offending parameter.

    Args:
        source: Full source text of the file.
        filepath: Path string used in the returned records.

    Returns:
        A list of :class:`UntypedDef` records, one per violation found;
        empty when the source cannot be parsed.
    """
    found: list[UntypedDef] = []
    try:
        module = ast.parse(source)
    except SyntaxError:
        return found

    def _record(line: int, name: str, issue: str) -> None:
        found.append(UntypedDef(file=filepath, line=line, name=name, issue=issue))

    functions = (
        candidate
        for candidate in ast.walk(module)
        if isinstance(candidate, (ast.FunctionDef, ast.AsyncFunctionDef))
    )
    for func in functions:
        if func.returns is None:
            _record(func.lineno, func.name, "missing_return_type")

        signature = func.args
        named_params = (
            *signature.args,
            *signature.posonlyargs,
            *signature.kwonlyargs,
        )
        for param in named_params:
            if param.arg not in {"self", "cls"} and param.annotation is None:
                _record(
                    param.lineno,
                    f"{func.name}.{param.arg}",
                    "missing_param_type",
                )

        star = signature.vararg
        if star is not None and (
            star.annotation is None or _is_any_annotation(star.annotation)
        ):
            _record(star.lineno, f"{func.name}.*{star.arg}", "untyped_args")

        double_star = signature.kwarg
        if double_star is not None and (
            double_star.annotation is None
            or _is_any_annotation(double_star.annotation)
        ):
            _record(
                double_star.lineno,
                f"{func.name}.**{double_star.arg}",
                "untyped_kwargs",
            )

    # TypeVar without constraints or bound — behaves identically to Any.
    found.extend(_find_unconstrained_typevars(module, filepath))
    return found
472
473
def _find_unconstrained_typevars(tree: ast.Module, filepath: str) -> list[UntypedDef]:
    """Report every plain ``X = TypeVar(...)`` that has no bound or constraints.

    A bare ``T = TypeVar("T")`` is semantically equivalent to ``T: Any``.
    The Rust port requires every generic to carry an explicit trait bound.

    Only plain assignments whose right-hand side is a direct call to the
    name ``TypeVar`` are inspected.  A call counts as constrained when it
    passes any positional argument after the name string, or a ``bound=``
    keyword.

    Args:
        tree: Parsed AST of the file.
        filepath: Path string used in the returned records.

    Returns:
        A list of :class:`UntypedDef` records with issue
        ``"unconstrained_typevar"``, one per offending assignment.
    """
    records: list[UntypedDef] = []
    for stmt in ast.walk(tree):
        if not isinstance(stmt, ast.Assign) or not isinstance(stmt.value, ast.Call):
            continue
        call = stmt.value
        callee = call.func
        if not isinstance(callee, ast.Name) or callee.id != "TypeVar":
            continue

        # The first positional argument is the TypeVar's name string; any
        # further positional arguments are constraint types.
        has_constraints = len(call.args) > 1
        has_bound = any(keyword.arg == "bound" for keyword in call.keywords)
        if has_constraints or has_bound:
            continue

        first_target = stmt.targets[0]
        label = first_target.id if isinstance(first_target, ast.Name) else "<TypeVar>"
        records.append(UntypedDef(
            file=filepath,
            line=stmt.lineno,
            name=label,
            issue="unconstrained_typevar",
        ))
    return records
519
520
521 # ---------------------------------------------------------------------------
522 # File and directory scanner
523 # ---------------------------------------------------------------------------
524
525
def _boundary_annotation_lines(source: str) -> frozenset[int]:
    """Return the 1-based lines covered by parameter or return annotations."""
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError):
        return frozenset()

    lines: set[int] = set()
    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        spec = node.args
        params = (
            *spec.posonlyargs,
            *spec.args,
            *spec.kwonlyargs,
            spec.vararg,
            spec.kwarg,
        )
        annotations = [p.annotation for p in params if p is not None]
        annotations.append(node.returns)
        for annotation in annotations:
            if annotation is None:
                continue
            last = annotation.end_lineno
            if last is None:
                last = annotation.lineno
            lines.update(range(annotation.lineno, last + 1))
    return frozenset(lines)


def scan_file(filepath: Path) -> FileResult | None:
    """Scan a single Python file and return its violation summary.

    String literals are masked before pattern matching so that raw regex
    strings and docstring prose never produce false positives.  Every
    pattern, including ``type_ignore``, is matched against the masked line;
    comments are not string tokens, so they survive masking and remain
    visible to the patterns.  Lines that are blank or consist only of a
    comment are skipped.

    Lines are numbered by splitting on ``"\\n"`` only, which is how editors
    and the AST number them.

    Returns ``None`` when the file cannot be read (I/O or encoding error).

    Args:
        filepath: Absolute or relative path to the Python file.

    Returns:
        A :class:`FileResult` on success, ``None`` on I/O failure.
    """
    try:
        source = filepath.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None

    masked = _mask_string_literals(source)

    # split("\n"), not splitlines(): splitlines() also breaks on form feed,
    # \x85, \u2028, ... which are not line ends for numbering purposes, and
    # masking turns such a character inside a string literal into a space,
    # which would leave the two line lists out of step.  Masking never
    # touches "\n", so both lists always have the same length here.
    original_lines = source.split("\n")
    masked_lines = masked.split("\n")

    # Lines that hold a parameter or return annotation.  Needed to recognise
    # the parameters of a multi-line signature, which sit on lines without
    # a ``def`` keyword.
    boundary_lines = _boundary_annotation_lines(source)

    patterns: defaultdict[str, int] = defaultdict(int)
    pattern_lines: defaultdict[str, list[int]] = defaultdict(list)
    type_ignore_variants: defaultdict[str, int] = defaultdict(int)

    for lineno, (orig_line, masked_line) in enumerate(
        zip(original_lines, masked_lines), 1
    ):
        stripped = masked_line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        # Dunder methods legitimately use `: object` (e.g.
        # `__eq__(self, other: object)`, `__contains__(self, item: object)`),
        # so param_object/return_object are not applied to those signatures.
        is_dunder_def = re.search(r"def\s+__\w+__\s*\(", masked_line) is not None

        # `: dict[str,` is a boundary only in a function signature: either
        # the line carries `def`, or the AST says the line holds a parameter
        # or return annotation.  Anywhere else it is a local variable
        # annotation (e.g. ``x: dict[str, int] = {}``) and is not counted.
        is_local_dict = (
            ": dict[str," in masked_line
            and re.search(r"\bdef\b", masked_line) is None
            and lineno not in boundary_lines
        )

        for name, pattern in _PATTERNS.items():
            if is_dunder_def and name in {"param_object", "return_object"}:
                continue
            if is_local_dict and name == "boundary_dict":
                continue
            count = _count_pattern_in_line(masked_line, pattern)
            if count > 0:
                patterns[name] += count
                pattern_lines[name].append(lineno)

                if name == "type_ignore":
                    # Classify against the original line so we can distinguish
                    # blanket ignores from code-specific ones.
                    variant = _classify_type_ignore(orig_line)
                    type_ignore_variants[variant] += 1

    return FileResult(
        file=str(filepath),
        imports_any=_imports_any(source),
        patterns=dict(patterns),
        pattern_lines=dict(pattern_lines),
        type_ignore_variants=dict(type_ignore_variants),
        untyped_defs=_find_untyped_defs(source, str(filepath)),
    )
601
602
def scan_directory(directory: Path) -> list[FileResult]:
    """Recursively scan all Python files in *directory*.

    Skips virtual environments, caches, build artefacts, and VCS/tool
    metadata directories (see ``_SKIP_DIRS``).

    Only directories *below* the scan root are tested against
    ``_SKIP_DIRS``.  The components of *directory* itself are not, so a
    root that lives under a path such as ``/ci/build/project`` is still
    scanned, and a root that was named explicitly is never skipped.

    Args:
        directory: Root of the directory tree to scan.

    Returns:
        A list of :class:`FileResult` objects, one per successfully scanned
        file, ordered by path.
    """
    results: list[FileResult] = []
    for py_file in sorted(directory.rglob("*.py")):
        # Every path yielded by rglob starts with `directory`, so the
        # relative form is always defined; drop the file name itself.
        inner_dirs = py_file.relative_to(directory).parts[:-1]
        if any(part in _SKIP_DIRS for part in inner_dirs):
            continue
        file_result = scan_file(py_file)
        if file_result is not None:
            results.append(file_result)
    return results
623
624
625 # ---------------------------------------------------------------------------
626 # Report generation
627 # ---------------------------------------------------------------------------
628
629
def _offender_sort_key(entry: Offender) -> int:
    """Sort key for ranking offenders: the file's total violation count.

    Args:
        entry: The offender record to rank.

    Returns:
        The record's ``total`` value.
    """
    total: int = entry["total"]
    return total
633
634
def generate_report(results: list[FileResult]) -> Report:
    """Fold per-file scan results into one :class:`Report`.

    Pattern counts and ``type: ignore`` variants are summed across files.
    One :class:`Violation` is emitted per (pattern, line) pair.  Files with
    at least one match appear in ``per_file`` and ``top_offenders``.
    Violations are sorted by file then line; offenders by descending total.

    Args:
        results: List of :class:`FileResult` objects from :func:`scan_file`
            or :func:`scan_directory`.

    Returns:
        A :class:`Report` ready for human display or JSON serialisation.
    """
    pattern_totals: defaultdict[str, int] = defaultdict(int)
    ignore_totals: defaultdict[str, int] = defaultdict(int)
    violations_by_file: PerFileViolations = {}
    offenders: list[Offender] = []
    untyped: list[UntypedDef] = []
    violations: list[Violation] = []
    any_importers = 0

    for file_result in results:
        path = file_result["file"]
        any_importers += 1 if file_result["imports_any"] else 0

        counts: PatternCounts = dict(file_result["patterns"])
        hit_lines = file_result["pattern_lines"]
        for kind, hits in counts.items():
            pattern_totals[kind] += hits
            violations.extend(
                Violation(file=path, line=line_no, kind=kind)
                for line_no in hit_lines.get(kind, [])
            )

        file_total = sum(counts.values())
        if file_total > 0:
            violations_by_file[path] = counts
            offenders.append(
                Offender(file=path, total=file_total, patterns=counts)
            )

        for variant, hits in file_result["type_ignore_variants"].items():
            ignore_totals[variant] += hits

        untyped.extend(file_result["untyped_defs"])

    violations.sort(key=operator.itemgetter("file", "line"))
    offenders.sort(key=_offender_sort_key, reverse=True)

    summary = ReportSummary(
        total_files_scanned=len(results),
        files_importing_any=any_importers,
        total_any_patterns=sum(pattern_totals.values()),
        untyped_defs=len(untyped),
    )
    # The full offender and untyped-def lists are stored; the human-readable
    # printer caps what it displays, and --json exposes everything.
    return Report(
        summary=summary,
        pattern_totals=dict(pattern_totals),
        type_ignore_variants=dict(ignore_totals),
        top_offenders=offenders,
        per_file=violations_by_file,
        violations=violations,
        untyped_defs=untyped,
    )
700
701
702 # ---------------------------------------------------------------------------
703 # Human-readable report printer
704 # ---------------------------------------------------------------------------
705
706
def print_human_summary(report: Report, top_n: int = 15) -> None:
    """Write a formatted, human-readable view of *report* to stdout.

    Sections, in order: headline counts, per-category pattern breakdown
    (categories with a zero total are omitted), ``type: ignore`` variants by
    descending count, every violation as ``file:line [kind]``, and the first
    *top_n* offenders.

    Args:
        report: A :class:`Report` produced by :func:`generate_report`.
        top_n: How many offenders to display in the top-offenders list.
    """
    summary = report["summary"]
    pattern_totals = report["pattern_totals"]
    rule = "=" * 70

    print(f"\n{rule}")
    print(" TYPING AUDIT — Violation Report")
    print(rule)
    print(f" Files scanned: {summary['total_files_scanned']}")
    print(f" Files importing Any: {summary['files_importing_any']}")
    print(f" Total violations: {summary['total_any_patterns']}")
    print(f" Untyped defs: {summary['untyped_defs']}")
    print()

    printed_category = False
    for category, pattern_names in _CATEGORY_ORDER:
        counts = [(p, pattern_totals.get(p, 0)) for p in pattern_names]
        if sum(hits for _, hits in counts) == 0:
            continue
        printed_category = True
        print(f" {category}:")
        for p, count in counts:
            if count > 0:
                print(f" {p:38s} {count:5d}")
        print()

    if not printed_category:
        print(" Pattern breakdown: (none)")
        print()

    variants = report["type_ignore_variants"]
    if variants:
        print(" # type: ignore variants:")
        ranked = sorted(variants.items(), key=lambda pair: pair[1], reverse=True)
        for variant, count in ranked:
            print(f" {variant:44s} {count:5d}")
        print()

    violations = report["violations"]
    if violations:
        print(" Violations (file:line [kind]):")
        for v in violations:
            print(f" {v['file']}:{v['line']} [{v['kind']}]")
        print()

    print(f" Top {top_n} offenders:")
    for entry in report["top_offenders"][:top_n]:
        print(f" {entry['total']:4d} {entry['file']}")
    print(f"{rule}\n")
763
764
765 # ---------------------------------------------------------------------------
766 # CLI
767 # ---------------------------------------------------------------------------
768
769
def main() -> None:
    """Entry point: parse CLI flags, run the scan, and enforce the ratchet.

    Scans the specified directories (or individual files), prints a human
    summary, optionally writes a JSON report, and exits non-zero when either
    the pattern violation count exceeds ``--max-any`` or the untyped-def
    count exceeds ``--max-untyped``.

    A file reached through more than one ``--dirs`` entry is reported once.
    Paths that are missing, or that exist but are neither a directory nor a
    ``.py`` file, are skipped with a warning on stderr.  A warning is also
    printed when nothing was scanned at all, because the ratchet would
    otherwise pass on an empty result.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Audit typing violations: Any, object, cast, bare collections, "
            "Optional/Union (legacy), Callable without signature, untyped "
            "varargs, type: ignore, untyped defs, unconstrained TypeVars."
        ),
    )
    parser.add_argument(
        "--dirs",
        nargs="+",
        default=["muse/", "tests/"],
        help="Directories or individual .py files to scan. Default: muse/ tests/",
    )
    parser.add_argument(
        "--json",
        type=str,
        metavar="PATH",
        help="Write the JSON report to PATH.",
    )
    parser.add_argument(
        "--max-any",
        type=int,
        default=None,
        metavar="N",
        help="Exit non-zero if total pattern violations exceed N (ratchet mode).",
    )
    parser.add_argument(
        "--max-untyped",
        type=int,
        default=None,
        metavar="N",
        help="Exit non-zero if total untyped-def count exceeds N (ratchet mode).",
    )
    parser.add_argument(
        "--top-n",
        type=int,
        default=15,
        metavar="N",
        help="Number of offenders to display in the human summary. Default: 15.",
    )
    args = parser.parse_args()

    all_results: list[FileResult] = []
    seen: set[Path] = set()

    def _collect(batch: list[FileResult]) -> None:
        # Keep the first result per resolved path so that overlapping --dirs
        # entries (e.g. ``muse/`` and ``muse/cli.py``) do not double-count.
        for item in batch:
            key = Path(item["file"]).resolve()
            if key in seen:
                continue
            seen.add(key)
            all_results.append(item)

    for d in args.dirs:
        p = Path(d)
        if p.is_file() and p.suffix == ".py":
            result = scan_file(p)
            if result is not None:
                _collect([result])
        elif p.is_dir():
            _collect(scan_directory(p))
        elif p.exists():
            print(
                f"WARNING: {d} is not a directory or a .py file, skipping",
                file=sys.stderr,
            )
        else:
            print(f"WARNING: {d} does not exist, skipping", file=sys.stderr)

    if not all_results:
        print("WARNING: no Python files were scanned", file=sys.stderr)

    report = generate_report(all_results)
    print_human_summary(report, top_n=args.top_n)

    if args.json:
        out = Path(args.json)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(json.dumps(report, indent=2), encoding="utf-8")
        print(f" JSON report written to {args.json}")

    failed = False

    if args.max_any is not None:
        total = report["summary"]["total_any_patterns"]
        if total > args.max_any:
            print(
                f"\n❌ RATCHET FAILED (patterns): {total} violations exceed "
                f"threshold of {args.max_any}",
                file=sys.stderr,
            )
            failed = True
        else:
            print(
                f"\n✅ RATCHET OK (patterns): {total} violations within "
                f"threshold of {args.max_any}",
            )

    if args.max_untyped is not None:
        untyped = report["summary"]["untyped_defs"]
        if untyped > args.max_untyped:
            print(
                f"\n❌ RATCHET FAILED (untyped defs): {untyped} exceed "
                f"threshold of {args.max_untyped}",
                file=sys.stderr,
            )
            failed = True
        else:
            print(
                f"\n✅ RATCHET OK (untyped defs): {untyped} within "
                f"threshold of {args.max_untyped}",
            )

    if failed:
        sys.exit(1)
875
876
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
File History 1 commit
sha256:51116ec824246acde6abf729e6ba854c223dc5173eff31a645520208023b0652 refactor(bridge): comprehensive spec sweep — close all issu… Sonnet 4.6 minor 28 days ago