tools/typing_audit.py · gabriel/muse

1

#!/usr/bin/env python3

2

"""Typing audit — zero-tolerance type-safety enforcement for mission-critical code.

Every banned pattern maps to a future Rust port liability: if Python cannot
name a type, ``rustc`` cannot either. The ratchet keeps the rule enforced
continuously so violations never accumulate.

Patterns checked
----------------

*Any-as-type* — ``dict[str, Any]``, ``list[Any]``, ``type[Any]``,
  ``Any | X``, ``X | Any``, ``Mapping[str, Any]``, etc.

*object-as-type* — same severity as Any; erases all structural information.

*cast()* — all usage banned; it conceals a broken callee return type.

*# type: ignore* — every suppressed error is an unaudited assumption.
  Only the blanket form (no ``[error-code]``) is flagged.

*Bare collections* — ``list``, ``dict``, ``set``, ``tuple`` without ``[T]``.

*Optional[X]* and *Union[X, Y]* — use ``X | None`` and ``X | Y`` (PEP 604).

*Legacy typing imports* — ``List``, ``Dict``, ``Set``, ``Tuple``.

*Bare Callable / Callable returning Any* — must carry a full signature.

*Untyped varargs* — ``*args: Any``, ``**kwargs: Any``, and unannotated
  ``*args`` / ``**kwargs`` (annotation absent entirely).

*Untyped function definitions* — missing return or parameter annotation.

*Unconstrained TypeVar* — ``TypeVar(...)`` with no ``bound=`` and no
  constraint arguments; behaves identically to ``Any`` in practice.

*Naked dict at boundary* — ``dict[str, X]`` as a parameter or return type
  is banned at function/method boundaries. Every dict with known keys must
  be a ``TypedDict``; every dict with dynamic keys must justify its key space.
  The only valid ``dict[str, ...]`` at a boundary is an explicitly named
  ``TypedDict`` subclass. This rule exists because ``rustc`` cannot infer
  struct fields from a ``HashMap<String, X>`` — named fields must be declared.
  Pattern ``boundary_dict`` fires on ``: dict[str,`` and ``-> dict[str,``.

*Anonymous dict in collection* — ``list[dict[str, X]]``, ``dict[str, dict[str, X]]``,
  ``tuple[dict[str, X], ...]``. An anonymous dict nested inside a collection is
  always a named struct waiting to be declared. Use a ``TypedDict`` subclass or a
  named type alias (e.g. ``list[JSONObject]``, ``list[SymbolHistoryEntry]``).

  Named type aliases do NOT trigger this rule — only the literal expansion does.
  This is by design: ``list[JSONObject]`` is fine; ``list[dict[str, JSONValue]]`` is not.
  Rust requires every struct field to be named; ``Vec<HashMap<String, Value>>`` is
  never the right answer when ``Vec<SymbolEntry>`` is possible.

  ``concrete_dict_in_list`` — fires on ``list[dict[str,``, ``tuple[dict[str,``,
                               ``set[dict[str,``
  ``dict_of_dict``          — fires on ``dict[str, dict[str,``

Usage::

    python tools/typing_audit.py                          # muse/ + tests/
    python tools/typing_audit.py --dirs muse/ tests/
    python tools/typing_audit.py --dirs muse/ --max-any 0 --max-untyped 0
    python tools/typing_audit.py --json artifacts/typing_audit.json
"""

64

65

from __future__ import annotations

import argparse

import ast

import io

import json

import operator

import re

import sys

import tokenize

from collections import defaultdict

76

from pathlib import Path

77

from typing import TypedDict

78

79

# ---------------------------------------------------------------------------

80

# Type aliases — avoid dict[str, X] at function/class-field boundaries.

81

# ---------------------------------------------------------------------------

82

83

type PatternCounts = dict[str, int]

84

type PatternLines = dict[str, list[int]]

85

type PatternMap = dict[str, re.Pattern[str]]

86

type PerFileViolations = dict[str, PatternCounts]

87

88

89

class Violation(TypedDict):
    """A single typed violation — one pattern match at one source location."""

    # Path string of the file in which the pattern matched.
    file: str
    # 1-based number of the line on which the pattern matched.
    line: int
    # Identifier of the pattern that matched (a key of ``_PATTERNS``).
    kind: str

# ---------------------------------------------------------------------------

97

# Data shapes — TypedDicts replace every dict[str, Any] in the old script.

98

# All shapes mirror the Rust struct that will eventually own them.

99

# ---------------------------------------------------------------------------

100

101

102

class UntypedDef(TypedDict):
    """A function or method that is missing a required type annotation.

    Also used for unconstrained ``TypeVar`` definitions found by the AST walk.

    ``issue`` is one of:

    - ``"missing_return_type"`` — no return annotation.
    - ``"missing_param_type"`` — a non-self/cls parameter lacks annotation.
    - ``"untyped_args"`` — ``*args`` is annotated as ``Any`` or has
      no annotation at all.
    - ``"untyped_kwargs"`` — ``**kwargs`` is annotated as ``Any`` or has
      no annotation at all.
    - ``"unconstrained_typevar"`` — a ``TypeVar`` with no ``bound=`` and no
      positional constraints.
    """

    # Path string of the file containing the definition.
    file: str
    # Line of the function, the offending parameter, or the TypeVar assignment.
    line: int
    # Function name for return-type issues; ``func.param``, ``func.*args`` or
    # ``func.**kwargs`` for parameter issues; the assignment target for TypeVars.
    name: str
    # One of the issue labels listed in the class docstring.
    issue: str

class FileResult(TypedDict):
    """Typing-violation summary for a single Python source file."""

    # Path string of the scanned file.
    file: str
    # Whether the file imports ``Any`` from ``typing`` / ``typing_extensions``.
    imports_any: bool
    # Total number of matches per pattern identifier.
    patterns: PatternCounts
    # Line numbers with at least one match, per pattern identifier (a line is
    # listed once even when the pattern matches it several times).
    pattern_lines: PatternLines
    # Counts per ``# type: ignore`` variant label.
    type_ignore_variants: PatternCounts
    # Records produced by the AST walk for missing annotations and TypeVars.
    untyped_defs: list[UntypedDef]

132

133

134

class Offender(TypedDict):
    """A file with at least one typing violation, ranked by total count."""

    # Path string of the offending file.
    file: str
    # Sum of all pattern counts recorded for the file.
    total: int
    # Per-pattern counts that add up to ``total``.
    patterns: PatternCounts

140

141

142

class ReportSummary(TypedDict):
    """High-level aggregate counts for the entire scan."""

    # Number of files that were scanned successfully.
    total_files_scanned: int
    # Number of scanned files flagged as importing ``Any``.
    files_importing_any: int
    # Sum of ALL pattern counts across all files — despite the name, this is
    # not limited to the Any-as-type patterns.
    total_any_patterns: int
    # Number of untyped-definition records across all files.
    untyped_defs: int

class Report(TypedDict):
    """Full typing-audit report produced by :func:`generate_report`."""

    # Aggregate counters for the whole scan.
    summary: ReportSummary
    # Total matches per pattern identifier, summed over all files.
    pattern_totals: PatternCounts
    # Counts per ``# type: ignore`` variant label, summed over all files.
    type_ignore_variants: PatternCounts
    # Every file with at least one violation, highest total first.
    top_offenders: list[Offender]
    # Per-pattern counts for each file with at least one violation.
    per_file: PerFileViolations
    # One record per (file, line, pattern) hit, sorted by file then line.
    violations: list[Violation]
    # All untyped-definition records from every scanned file.
    untyped_defs: list[UntypedDef]

161

162

163

# ---------------------------------------------------------------------------

164

# String-literal masking

165

# ---------------------------------------------------------------------------

166

167

168

def _mask_string_literals(source: str) -> str:
    """Replace string-literal content with spaces, preserving newlines.

    Pattern matching runs on the masked source so that raw regex strings,
    docstrings, and string constants never produce false positives.  The
    result always has the same length as *source* and every ``\\n`` / ``\\r``
    stays in place, so line numbers and column offsets remain accurate.

    Tokenisation errors (e.g. incomplete source snippets or inconsistent
    indentation) are tolerated: masking stops at the point of failure.
    String tokens seen before the error stay masked and the remainder of the
    source is returned as-is, so the caller still produces *some* output
    rather than dropping the file.

    Args:
        source: Full source text of a Python file.

    Returns:
        A copy of *source* with the content of every string token replaced
        by space characters (newlines within multi-line strings preserved).
    """
    chars = list(source)

    # Absolute offset of the start of each row.  Rows are delimited by "\n"
    # ONLY, because that is how ``io.StringIO.readline`` feeds lines to
    # ``tokenize``.  ``str.splitlines`` would also break on form feeds and
    # other Unicode separators, shifting every later (row, col) position.
    offsets: list[int] = [0]
    for row_text in source.split("\n"):
        offsets.append(offsets[-1] + len(row_text) + 1)

    # Token types that carry string-literal content — including f-string
    # middle segments, which are FSTRING_MIDDLE (not STRING) in Python 3.12+.
    string_types = {tokenize.STRING}
    fstring_middle = getattr(tokenize, "FSTRING_MIDDLE", None)
    if fstring_middle is not None:
        string_types.add(fstring_middle)

    try:
        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
            if tok.type not in string_types:
                continue
            (srow, scol), (erow, ecol) = tok.start, tok.end
            start = offsets[srow - 1] + scol
            # Clamp defensively so an odd token position can never index
            # past the end of the buffer.
            end = min(offsets[erow - 1] + ecol, len(chars))
            for i in range(start, end):
                if chars[i] not in "\r\n":
                    chars[i] = " "
    except (tokenize.TokenError, SyntaxError):
        # TokenError: unterminated string / bracket at EOF.
        # SyntaxError: covers the IndentationError that tokenize raises for
        # an inconsistent dedent.  Keep whatever was masked so far.
        pass

    return "".join(chars)

218

219

220

# ---------------------------------------------------------------------------

221

# Pattern registry

222

# ---------------------------------------------------------------------------

223

224

#: All patterns that count toward the violation total.

225

#: Keys are stable identifiers used in JSON output and tests.

226

#:

227

#: NOTE: do NOT use re.IGNORECASE — Python type annotations are case-sensitive.

228

#: ``List`` and ``list`` are distinct identifiers; matching ``list[any]``

229

#: (where ``any`` is the built-in function) would be a false positive.

230

_PATTERNS: PatternMap = {
    # Any-as-type ─────────────────────────────────────────────────────────
    "dict_str_any": re.compile(r"\bdict\[str,\s*Any\]|\bDict\[str,\s*Any\]"),
    "list_any": re.compile(r"\blist\[Any\]|\bList\[Any\]"),
    "type_any": re.compile(r"\btype\[Any\]"),
    "any_in_union": re.compile(r"\bAny\s*\||\|\s*Any\b"),
    "return_any": re.compile(r"->\s*Any\b"),
    "param_any": re.compile(r":\s*Any\b"),
    "mapping_any": re.compile(r"\bMapping\[str,\s*Any\]"),
    "optional_any": re.compile(r"\bOptional\[Any\]"),
    "sequence_any": re.compile(r"\bSequence\[Any\]|\bIterable\[Any\]"),
    # ``Any`` is matched as a whole word so identifiers that merely contain
    # it (``AnyStr``, ``HasAnyFlag``) inside a tuple are not false positives.
    "tuple_any": re.compile(
        r"\btuple\[[^\n]*\bAny\b[^\n]*\]|\bTuple\[[^\n]*\bAny\b[^\n]*\]"
    ),
    # object-as-type ──────────────────────────────────────────────────────
    "param_object": re.compile(r":\s*object\b"),
    "return_object": re.compile(r"->\s*object\b"),
    # Handles one level of nesting, e.g. dict[str, list[object]].
    # NOTE: Mapping is intentionally excluded — Mapping[str, object] is the
    # correct type for read-only, covariant mappings at framework boundaries
    # (e.g. Jinja2 template contexts). Mapping[str, Any] is caught separately
    # by mapping_any. Only mutable collection types need this guard.
    "collection_object": re.compile(
        r"\b(?:dict|list|set|tuple|Sequence)"
        r"\[[^\n\[\]]*(?:\[[^\n\[\]]*\][^\n\[\]]*)*\bobject\b"
    ),
    # cast() — banned ─────────────────────────────────────────────────────
    "cast_usage": re.compile(r"(?<![.\w])cast\("),
    # type: ignore — only flag blanket suppresses (no specific error code).
    # ``# type: ignore[some-code]`` is acceptable when the exact issue is known;
    # ``# type: ignore`` with no code is a blind suppression and always banned.
    "type_ignore": re.compile(r"#\s*type:\s*ignore(?!\s*\[)"),
    # Bare collections (no type parameters) ───────────────────────────────
    # Negative lookaheads exclude parameterised forms and prose.
    "bare_list": re.compile(r"(?::\s*|->\s*)list\b(?!\[|\(|\s+[a-z])"),
    "bare_dict": re.compile(r"(?::\s*|->\s*)dict\b(?!\[|\(|\s+[a-z])"),
    "bare_set": re.compile(r"(?::\s*|->\s*)set\b(?!\[|\(|\s+[a-z])"),
    "bare_tuple": re.compile(r"(?::\s*|->\s*)tuple\b(?!\[|\(|\s+[a-z])"),
    # Optional[X] — use X | None (PEP 604) ────────────────────────────────
    "optional_usage": re.compile(r"\bOptional\[(?!Any\b)"),
    # Union[X, Y] — use X | Y (PEP 604) ──────────────────────────────────
    "union_usage": re.compile(r"\bUnion\["),
    # Legacy typing imports (use lowercase builtins) ──────────────────────
    "legacy_List": re.compile(r"\bList\["),
    "legacy_Dict": re.compile(r"\bDict\["),
    "legacy_Set": re.compile(r"\bSet\["),
    "legacy_Tuple": re.compile(r"\bTuple\["),
    # Callable — must carry full signature ────────────────────────────────
    "bare_callable": re.compile(r"(?::\s*|->\s*)Callable\b(?!\[)"),
    "callable_any": re.compile(r"\bCallable\[[^\n]*,\s*Any\s*\]"),
    # Untyped varargs — *args: Any / **kwargs: Any ────────────────────────
    # Unannotated *args/**kwargs are caught by the AST walker instead.
    "varargs_any": re.compile(r"\*{1,2}\w+:\s*Any\b"),
    # Naked dict at boundary — dict[str, X] as param/return type is banned.
    # Every structured boundary must use a TypedDict (or dataclass/enum).
    # Matches ": dict[str," and "-> dict[str," — the two annotation positions.
    #
    # APPROVED alternatives at boundaries:
    #   - ReadOnlyJSONObject (= Mapping[str, JSONValue]) for read-only JSON params
    #   - A named TypedDict subclass for any dict with statically known keys
    #
    # Mapping[str, JSONValue] is covariant so any dict[str, T where T ⊆ JSONValue]
    # is assignable to it. This pattern (boundary_dict) does NOT fire on
    # Mapping[...]; mapping_any does NOT fire on Mapping[str, JSONValue].
    # Therefore Mapping[str, JSONValue] is the safe boundary form for JSON dicts.
    "boundary_dict": re.compile(r"(?::\s*|->\s*)dict\[str\s*,"),
    # Anonymous dict in collection — list[dict[str, X]] / dict[str, dict[str, X]].
    # A dict nested inside a collection is always a named struct opportunity.
    # Use a TypedDict subclass or a named type alias (e.g. list[JSONObject]).
    # Named aliases do NOT trigger this rule — only the literal expansion does.
    # This is intentional: list[JSONObject] is fine; list[dict[str, JSONValue]] is not.
    "concrete_dict_in_list": re.compile(
        r"\b(?:list|tuple|set)\[dict\[str,"
    ),
    "dict_of_dict": re.compile(
        r"\bdict\[str,\s*dict\[str,"
    ),
}

# Category groupings for the human-readable report, in display order.

308

# Each entry is ``(display heading, [pattern identifiers])``.  The human
# summary prints the headings in this order and, under each heading, the
# listed patterns that have a non-zero total.  Every identifier here is a
# key of ``_PATTERNS``.
_CATEGORY_ORDER: list[tuple[str, list[str]]] = [
    ("Any-as-type", [
        "dict_str_any", "list_any", "type_any", "any_in_union",
        "return_any", "param_any",
        "mapping_any", "optional_any", "sequence_any", "tuple_any",
    ]),
    ("object-as-type", ["param_object", "return_object", "collection_object"]),
    ("cast() usage", ["cast_usage"]),
    ("type: ignore", ["type_ignore"]),
    ("Bare collections", ["bare_list", "bare_dict", "bare_set", "bare_tuple"]),
    ("Optional (use X | None)", ["optional_usage"]),
    ("Union (use X | Y)", ["union_usage"]),
    ("Legacy typing imports", ["legacy_List", "legacy_Dict", "legacy_Set", "legacy_Tuple"]),
    ("Callable (must carry full signature)", ["bare_callable", "callable_any"]),
    ("Untyped varargs", ["varargs_any"]),
    ("Naked dict at boundary (use TypedDict)", ["boundary_dict"]),
    ("Anonymous dict in collection (use TypedDict or named alias)", [
        "concrete_dict_in_list", "dict_of_dict",
    ]),
]

# Directories that are never source code and must be skipped during scanning.

330

# Directory names whose contents are excluded from the recursive scan:
# virtual environments, bytecode and tool caches, VCS/tool metadata, and
# build or packaging output.  Matching is by exact path-component name.
_SKIP_DIRS: frozenset[str] = frozenset({
    "venv", ".venv", "env", ".env",
    "__pycache__",
    ".git", ".muse", ".mypy_cache", ".ruff_cache", ".pytest_cache", ".tox",
    "dist", "build", "site-packages", "__pypackages__",
})

# ---------------------------------------------------------------------------

339

# Pattern helpers

340

# ---------------------------------------------------------------------------

341

342

343

def _count_pattern_in_line(line: str, pattern: re.Pattern[str]) -> int:
    """Count how many times *pattern* matches in *line*.

    Matches are found left to right and do not overlap.

    Args:
        line: The text to search (normally one masked source line).
        pattern: The compiled regular expression to look for.

    Returns:
        The number of non-overlapping matches; ``0`` when there are none.
    """
    return sum(1 for _match in pattern.finditer(line))

346

347

348

def _imports_any(source: str) -> bool:
    """Return ``True`` if the source imports ``Any`` from ``typing``
    or ``typing_extensions``.

    Detection walks the parsed AST, so it sees parenthesised multi-line
    imports (``from typing import (\\n    Any,\\n)``) and is not fooled by
    commented-out imports, trailing comments that mention ``Any``, or import
    statements quoted inside string literals.

    When the source cannot be parsed, a line-based regular expression is used
    instead so that broken files are still classified on a best-effort basis.
    That fallback only recognises single-line imports and ignores lines where
    ``from`` is preceded by ``#``.

    Args:
        source: Full source text of a Python file.

    Returns:
        ``True`` when a ``from typing[_extensions] import ... Any ...``
        statement is present, ``False`` otherwise.
    """
    try:
        tree = ast.parse(source)
    except SyntaxError:
        # Best-effort fallback for unparsable source.
        return bool(re.search(
            r"^[ \t]*from\s+typing(?:_extensions)?\s+import\s+.*\bAny\b",
            source,
            re.MULTILINE,
        ))

    typing_modules = {"typing", "typing_extensions"}
    return any(
        isinstance(node, ast.ImportFrom)
        and node.level == 0  # absolute import, not ``from .typing import``
        and node.module in typing_modules
        and any(alias.name == "Any" for alias in node.names)
        for node in ast.walk(tree)
    )

def _classify_type_ignore(line: str) -> str:
    """Label the kind of ``# type: ignore`` comment found on *line*.

    A comment that names error codes in brackets is labelled with those
    codes; anything else is treated as a blanket suppression.

    Args:
        line: A single source line that contains ``# type: ignore``.

    Returns:
        ``"type_ignore[<codes>]"`` when the comment carries a bracketed,
        non-empty code list, otherwise ``"type_ignore[blanket]"``.
    """
    coded = re.search(r"#\s*type:\s*ignore\[([^\]]+)\]", line)
    if coded is None:
        return "type_ignore[blanket]"
    return f"type_ignore[{coded.group(1)}]"

378

379

380

# ---------------------------------------------------------------------------

381

# AST-based detection

382

# ---------------------------------------------------------------------------

383

384

385

def _is_any_annotation(node: ast.expr | None) -> bool:
    """Return ``True`` if *node* is an annotation that spells ``Any``.

    Both the bare name (``Any``) and the module-qualified forms
    (``typing.Any``, ``typing_extensions.Any``) are recognised, so the
    qualified spelling cannot be used to slip past the audit.

    Args:
        node: The annotation expression, or ``None`` when there is none.

    Returns:
        ``True`` for ``Any`` / ``typing.Any`` / ``typing_extensions.Any``;
        ``False`` for every other expression and for ``None``.
    """
    if isinstance(node, ast.Name):
        return node.id == "Any"
    if isinstance(node, ast.Attribute) and node.attr == "Any":
        owner = node.value
        return isinstance(owner, ast.Name) and owner.id in {"typing", "typing_extensions"}
    return False

388

389

390

def _find_untyped_defs(source: str, filepath: str) -> list[UntypedDef]:
    """Walk the AST and collect every function with a missing annotation.

    Checks:

    - Missing return type (``node.returns is None``).
    - Missing parameter annotation (excluding ``self`` and ``cls``).
    - ``*args`` annotated as ``Any`` **or** with no annotation at all.
    - ``**kwargs`` annotated as ``Any`` **or** with no annotation at all.
    - ``TypeVar(...)`` assignments with no ``bound=`` and no constraints.

    Line numbers for parameter violations use the argument's own line number
    (``arg.lineno``) rather than the function definition line, so the report
    points directly at the problematic parameter.  Within one function the
    records follow signature order: return type, positional-only parameters,
    regular parameters, keyword-only parameters, ``*args``, ``**kwargs``.

    Skips files that cannot be parsed.

    Args:
        source: Full source text of the file.
        filepath: Path string used in the returned records.

    Returns:
        A list of :class:`UntypedDef` records, one per violation found;
        empty when *source* has a syntax error.
    """
    try:
        tree = ast.parse(source)
    except SyntaxError:
        return []

    results: list[UntypedDef] = []
    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        if node.returns is None:
            results.append(UntypedDef(
                file=filepath,
                line=node.lineno,
                name=node.name,
                issue="missing_return_type",
            ))

        signature = node.args
        # Positional-only parameters come first in a signature, so list them
        # first to keep the records in source order.
        named_params = [
            *signature.posonlyargs,
            *signature.args,
            *signature.kwonlyargs,
        ]
        results.extend(
            UntypedDef(
                file=filepath,
                line=param.lineno,
                name=f"{node.name}.{param.arg}",
                issue="missing_param_type",
            )
            for param in named_params
            if param.arg not in {"self", "cls"} and param.annotation is None
        )

        # ``*args`` and ``**kwargs`` share one rule: flagged when the
        # annotation is absent or is ``Any``.
        for star_param, stars, issue in (
            (signature.vararg, "*", "untyped_args"),
            (signature.kwarg, "**", "untyped_kwargs"),
        ):
            if star_param is None:
                continue
            if star_param.annotation is None or _is_any_annotation(star_param.annotation):
                results.append(UntypedDef(
                    file=filepath,
                    line=star_param.lineno,
                    name=f"{node.name}.{stars}{star_param.arg}",
                    issue=issue,
                ))

    # TypeVar without constraints or bound — behaves identically to Any.
    results.extend(_find_unconstrained_typevars(tree, filepath))
    return results

def _find_unconstrained_typevars(tree: ast.Module, filepath: str) -> list[UntypedDef]:
    """Return a record for every ``TypeVar(...)`` with no bound or constraints.

    A bare ``T = TypeVar("T")`` is semantically equivalent to ``T: Any``.
    The Rust port requires every generic to carry an explicit trait bound.

    Both call spellings are recognised: the bare ``TypeVar("T")`` and the
    attribute form ``typing.TypeVar("T")`` (any ``<name>.TypeVar(...)``).
    Only plain assignments (``T = ...``) are inspected.

    Args:
        tree: Parsed AST of the file.
        filepath: Path string used in the returned records.

    Returns:
        A list of :class:`UntypedDef` records for unconstrained ``TypeVar``
        definitions.
    """
    results: list[UntypedDef] = []
    for node in ast.walk(tree):
        # Match: T = TypeVar("T")  /  T = typing.TypeVar("T")
        if not isinstance(node, ast.Assign) or not isinstance(node.value, ast.Call):
            continue
        call = node.value
        callee = call.func
        is_typevar_call = (
            (isinstance(callee, ast.Name) and callee.id == "TypeVar")
            or (isinstance(callee, ast.Attribute) and callee.attr == "TypeVar")
        )
        if not is_typevar_call:
            continue

        # A TypeVar is constrained when it has:
        #   - positional args beyond the name (constraint types), OR
        #   - a keyword arg named "bound"
        has_constraints = len(call.args) > 1  # args[0] is the name string
        has_bound = any(kw.arg == "bound" for kw in call.keywords)
        if has_constraints or has_bound:
            continue

        first_target = node.targets[0]
        target_name = first_target.id if isinstance(first_target, ast.Name) else "<TypeVar>"
        results.append(UntypedDef(
            file=filepath,
            line=node.lineno,
            name=target_name,
            issue="unconstrained_typevar",
        ))
    return results

# ---------------------------------------------------------------------------

522

# File and directory scanner

523

# ---------------------------------------------------------------------------

524

525

526

def scan_file(filepath: Path) -> FileResult | None:
    """Scan a single Python file and return its violation summary.

    String literals are masked before pattern matching so that raw regex
    strings and docstring prose never produce false positives.  Comments are
    not string tokens, so ``# type: ignore`` on a code line is still seen in
    the masked text; the *original* line is used only to classify the ignore
    variant.

    Lines are split on ``"\\n"`` only.  Masking preserves every newline, so
    the original and masked line lists always have the same length and the
    reported line numbers match the ones used by the AST walk.

    Returns ``None`` when the file cannot be read (I/O or encoding error).

    Args:
        filepath: Absolute or relative path to the Python file.

    Returns:
        A :class:`FileResult` on success, ``None`` on I/O failure.
    """
    try:
        source = filepath.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None

    masked = _mask_string_literals(source)

    # ``str.splitlines`` also breaks on form feeds and other separators.  The
    # masker blanks those inside string literals, which would leave the two
    # lists with different lengths and misalign every later line.
    original_lines = source.split("\n")
    masked_lines = masked.split("\n")

    patterns: defaultdict[str, int] = defaultdict(int)
    pattern_lines: defaultdict[str, list[int]] = defaultdict(list)
    type_ignore_variants: defaultdict[str, int] = defaultdict(int)

    for lineno, (orig_line, masked_line) in enumerate(
        zip(original_lines, masked_lines), 1
    ):
        stripped = masked_line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        # Per-line facts, computed once instead of once per pattern.
        #
        # Dunder methods legitimately use `: object` (e.g.
        # `__eq__(self, other: object)`, `__contains__(self, item: object)`),
        # so param_object/return_object are skipped on those signatures.
        is_dunder_def = re.search(r"def\s+__\w+__\s*\(", masked_line) is not None
        # boundary_dict fires on `: dict[str,` and `-> dict[str,`.  Local
        # variable annotations (e.g. ``x: dict[str, int] = {}``) are NOT
        # boundaries, so `: dict[str,` on a non-def line is skipped.
        is_local_dict_annotation = (
            ": dict[str," in masked_line
            and re.search(r"\bdef\b", masked_line) is None
        )

        for name, pattern in _PATTERNS.items():
            if is_dunder_def and name in {"param_object", "return_object"}:
                continue
            if is_local_dict_annotation and name == "boundary_dict":
                continue

            count = _count_pattern_in_line(masked_line, pattern)
            if count > 0:
                patterns[name] += count
                pattern_lines[name].append(lineno)

                if name == "type_ignore":
                    # Classify against the original line so we can distinguish
                    # blanket ignores from code-specific ones.
                    variant = _classify_type_ignore(orig_line)
                    type_ignore_variants[variant] += 1

    return FileResult(
        file=str(filepath),
        imports_any=_imports_any(source),
        patterns=dict(patterns),
        pattern_lines=dict(pattern_lines),
        type_ignore_variants=dict(type_ignore_variants),
        untyped_defs=_find_untyped_defs(source, str(filepath)),
    )

def scan_directory(directory: Path) -> list[FileResult]:
    """Recursively scan all Python files in *directory*.

    Skips virtual environments, caches, build artefacts, and VCS/tool
    metadata directories (see ``_SKIP_DIRS``).  Only directories *below*
    *directory* are tested against the skip list, so a scan root that itself
    lives under e.g. ``/build/...`` or ``env/...`` is still scanned.

    Args:
        directory: Root of the directory tree to scan.

    Returns:
        A list of :class:`FileResult` objects, one per successfully scanned
        file, in sorted path order.
    """
    results: list[FileResult] = []
    for py_file in sorted(directory.rglob("*.py")):
        # Components between the scan root and the file itself; the final
        # component is the file name and is never a directory to skip.
        enclosing_dirs = py_file.relative_to(directory).parts[:-1]
        if any(part in _SKIP_DIRS for part in enclosing_dirs):
            continue
        file_result = scan_file(py_file)
        if file_result is not None:
            results.append(file_result)
    return results

# ---------------------------------------------------------------------------

626

# Report generation

627

# ---------------------------------------------------------------------------

628

629

630

def _offender_sort_key(entry: Offender) -> int:
    """Sort key for ranking offenders: the file's total violation count.

    Args:
        entry: The :class:`Offender` record being ranked.

    Returns:
        The value stored under ``"total"``.
    """
    total = entry["total"]
    return total

633

634

635

def generate_report(results: list[FileResult]) -> Report:
    """Fold per-file scan results into a single :class:`Report`.

    For every file the pattern counts are added to the global totals, one
    :class:`Violation` is emitted per recorded (pattern, line) pair, and
    files with at least one violation are listed as offenders.  Violations
    are ordered by file then line; offenders by descending total.

    Args:
        results: List of :class:`FileResult` objects from :func:`scan_file`
            or :func:`scan_directory`.

    Returns:
        A :class:`Report` ready for human display or JSON serialisation.
    """
    pattern_totals: PatternCounts = {}
    ignore_variant_totals: PatternCounts = {}
    per_file: PerFileViolations = {}
    offenders: list[Offender] = []
    violations: list[Violation] = []
    untyped: list[UntypedDef] = []
    any_importers = 0

    for file_result in results:
        path = file_result["file"]
        any_importers += 1 if file_result["imports_any"] else 0

        counts: PatternCounts = dict(file_result["patterns"])
        lines_by_pattern = file_result["pattern_lines"]
        for kind, hits in counts.items():
            pattern_totals[kind] = pattern_totals.get(kind, 0) + hits
            violations.extend(
                Violation(file=path, line=line_number, kind=kind)
                for line_number in lines_by_pattern.get(kind, [])
            )

        file_total = sum(counts.values())
        if file_total > 0:
            per_file[path] = counts
            offenders.append(Offender(file=path, total=file_total, patterns=counts))

        for label, hits in file_result["type_ignore_variants"].items():
            ignore_variant_totals[label] = ignore_variant_totals.get(label, 0) + hits

        untyped += file_result["untyped_defs"]

    # Both sorts are stable, so ties keep their scan order.
    ordered_violations = sorted(violations, key=lambda v: (v["file"], v["line"]))
    ranked_offenders = sorted(offenders, key=_offender_sort_key, reverse=True)

    summary = ReportSummary(
        total_files_scanned=len(results),
        files_importing_any=any_importers,
        total_any_patterns=sum(pattern_totals.values()),
        untyped_defs=len(untyped),
    )
    return Report(
        summary=summary,
        pattern_totals=pattern_totals,
        type_ignore_variants=ignore_variant_totals,
        # Every offender is stored; the human-readable printer applies its
        # own display cap.
        top_offenders=ranked_offenders,
        per_file=per_file,
        violations=ordered_violations,
        # The full list is kept so that --json consumers see every record.
        untyped_defs=untyped,
    )

# ---------------------------------------------------------------------------

703

# Human-readable report printer

704

# ---------------------------------------------------------------------------

705

706

707

def print_human_summary(report: Report, top_n: int = 15) -> None:
    """Write a plain-text rendering of *report* to stdout.

    The output consists of a header with the summary counters, a per-category
    pattern breakdown (or a ``(none)`` marker), the ``# type: ignore``
    variants ordered by count, every violation as ``file:line [kind]``, and
    the leading offenders.

    Args:
        report: A :class:`Report` produced by :func:`generate_report`.
        top_n: How many offenders to display in the top-offenders list.
    """
    summary = report["summary"]
    pattern_totals = report["pattern_totals"]

    print(f"\n{'=' * 70}")
    print(" TYPING AUDIT — Violation Report")
    print("=" * 70)
    print(f" Files scanned: {summary['total_files_scanned']}")
    print(f" Files importing Any: {summary['files_importing_any']}")
    print(f" Total violations: {summary['total_any_patterns']}")
    print(f" Untyped defs: {summary['untyped_defs']}")
    print()

    printed_category = False
    for category, pattern_names in _CATEGORY_ORDER:
        counts = [(p, pattern_totals.get(p, 0)) for p in pattern_names]
        if sum(count for _, count in counts) == 0:
            continue
        printed_category = True
        print(f" {category}:")
        for p, count in counts:
            if count > 0:
                print(f" {p:38s} {count:5d}")
        print()

    if not printed_category:
        print(" Pattern breakdown: (none)")
        print()

    variants = report["type_ignore_variants"]
    if variants:
        print(" # type: ignore variants:")
        # Most frequent variant first; ties keep their stored order.
        ranked = sorted(variants.items(), key=lambda item: item[1], reverse=True)
        for variant, count in ranked:
            print(f" {variant:44s} {count:5d}")
        print()

    violations = report["violations"]
    if violations:
        print(" Violations (file:line [kind]):")
        for v in violations:
            print(f" {v['file']}:{v['line']} [{v['kind']}]")
        print()

    print(f" Top {top_n} offenders:")
    for entry in report["top_offenders"][:top_n]:
        print(f" {entry['total']:4d} {entry['file']}")
    print("=" * 70 + "\n")

763

764

765

# ---------------------------------------------------------------------------

766

# CLI

767

# ---------------------------------------------------------------------------

def main() -> None:
    """Entry point: parse CLI flags, run the scan, and enforce the ratchet.

    Scans the specified directories (or individual files), prints a human
    summary, optionally writes a JSON report, and exits non-zero when either
    the pattern violation count exceeds ``--max-any`` or the untyped-def
    count exceeds ``--max-untyped``.

    Inputs that cannot be scanned are reported on stderr and skipped: paths
    that do not exist, paths that are neither a directory nor a ``.py`` file,
    and individual files that cannot be read.

    Raises:
        SystemExit: With status 1 when a ratchet threshold is exceeded.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Audit typing violations: Any, object, cast, bare collections, "
            "Optional/Union (legacy), Callable without signature, untyped "
            "varargs, type: ignore, untyped defs, unconstrained TypeVars."
        ),
    )
    parser.add_argument(
        "--dirs",
        nargs="+",
        default=["muse/", "tests/"],
        help="Directories or individual .py files to scan. Default: muse/ tests/",
    )
    parser.add_argument(
        "--json",
        type=str,
        metavar="PATH",
        help="Write the JSON report to PATH.",
    )
    parser.add_argument(
        "--max-any",
        type=int,
        default=None,
        metavar="N",
        help="Exit non-zero if total pattern violations exceed N (ratchet mode).",
    )
    parser.add_argument(
        "--max-untyped",
        type=int,
        default=None,
        metavar="N",
        help="Exit non-zero if total untyped-def count exceeds N (ratchet mode).",
    )
    parser.add_argument(
        "--top-n",
        type=int,
        default=15,
        metavar="N",
        help="Number of offenders to display in the human summary. Default: 15.",
    )
    args = parser.parse_args()

    all_results: list[FileResult] = []
    for d in args.dirs:
        p = Path(d)
        if p.is_file() and p.suffix == ".py":
            result = scan_file(p)
            if result is not None:
                all_results.append(result)
            else:
                # An explicitly requested file must not vanish silently.
                print(f"WARNING: {d} could not be read, skipping", file=sys.stderr)
        elif p.is_dir():
            all_results.extend(scan_directory(p))
        elif p.exists():
            # Exists, but is neither a directory nor a .py file.
            print(
                f"WARNING: {d} is not a directory or .py file, skipping",
                file=sys.stderr,
            )
        else:
            print(f"WARNING: {d} does not exist, skipping", file=sys.stderr)

    report = generate_report(all_results)
    print_human_summary(report, top_n=args.top_n)

    if args.json:
        out = Path(args.json)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(json.dumps(report, indent=2), encoding="utf-8")
        print(f" JSON report written to {args.json}")

    failed = False

    if args.max_any is not None:
        total = report["summary"]["total_any_patterns"]
        if total > args.max_any:
            print(
                f"\n❌ RATCHET FAILED (patterns): {total} violations exceed "
                f"threshold of {args.max_any}",
                file=sys.stderr,
            )
            failed = True
        else:
            print(
                f"\n✅ RATCHET OK (patterns): {total} violations within "
                f"threshold of {args.max_any}",
            )

    if args.max_untyped is not None:
        untyped = report["summary"]["untyped_defs"]
        if untyped > args.max_untyped:
            print(
                f"\n❌ RATCHET FAILED (untyped defs): {untyped} exceed "
                f"threshold of {args.max_untyped}",
                file=sys.stderr,
            )
            failed = True
        else:
            print(
                f"\n✅ RATCHET OK (untyped defs): {untyped} within "
                f"threshold of {args.max_untyped}",
            )

    if failed:
        sys.exit(1)

# Run the audit only when this file is executed as a script; importing the
# module (e.g. from tests) must not trigger a scan.
if __name__ == "__main__":
    main()