doc_extractor.py
python
sha256:b6cae4448122b2cc690d913be26f7e0a539f11855b8d288bd48be43eb532b5b2
refactor: migrate all source callers off muse.core.store re…
Sonnet 4.6
minor
⚠ breaking
29 days ago
| 1 | """Symbol-aware documentation extraction for ``muse code docs``. |
| 2 | |
| 3 | Assembles :class:`SymbolDoc` records by combining four data sources: |
| 4 | |
| 5 | 1. **Symbol graph** — :class:`~muse.core.symbol_cache.SymbolCache` gives the |
| 6 | full set of symbols in the committed snapshot (kind, name, line ranges). |
| 7 | 2. **Call graph** — :mod:`muse.plugins.code._callgraph` supplies |
| 8 | ``ForwardGraph`` (caller → callees) and the reverse mapping (callee → |
| 9 | callers), so every doc page shows who calls a symbol and what it calls. |
| 10 | 3. **Version history** — :mod:`muse.core.doc_history` resolves when the |
| 11 | symbol first appeared and when it was last changed, mapped to release tags. |
| 12 | 4. **Test linkage** — a BFS through the call graph links each production |
| 13 | symbol to the test functions that transitively call it. |
| 14 | |
| 15 | Docstring extraction |
| 16 | -------------------- |
| 17 | :class:`~muse.plugins.code.ast_parser.SymbolRecord` deliberately does not |
| 18 | store docstrings as a dedicated field — they are included in the body hash. |
| 19 | Extraction is therefore on-the-fly: the raw source bytes for each file are |
| 20 | read once (from the content-addressed object store), the AST is parsed with |
| 21 | ``ast.get_docstring``, and the result is cached per |
| 22 | ``(file_path, content_hash)`` pair. |
| 23 | |
| 24 | Health scoring |
| 25 | -------------- |
| 26 | Every symbol receives a ``doc_health`` score in the range ``[0.0, 1.0]``: |
| 27 | |
| 28 | +0.30 has a docstring |
| 29 | +0.20 docstring ≥ 40 chars (substantive) |
| 30 | +0.20 at least one linked test exists |
| 31 | +0.15 ``since_version`` was inferred |
| 32 | +0.15 docstring is not stale (implementation unchanged since the docstring was last updated) |
| 33 | |
| 34 | The ``doc_debt_score`` on :class:`DocSummary` is |
| 35 | ``1.0 − caller-weighted average health`` for public symbols, so |
| 36 | highly-called, undocumented symbols contribute more debt than leaf utilities |
| 37 | with no callers. |
| 38 | |
| 39 | Security |
| 40 | -------- |
| 41 | * Source files are read as raw bytes from the SHA-256–verified object store (with a working-tree fallback only when an object is missing from the store). |
| 42 | ``ast.parse`` is used only to extract docstrings — it never executes code. |
| 43 | * No subprocess is spawned. No working-tree file is written. |
| 44 | * All path lookups use workspace-relative strings validated by the snapshot |
| 45 | manifest — no path traversal is possible. |
| 46 | |
| 47 | Performance |
| 48 | ----------- |
| 49 | * AST module parses are cached per ``(file_path, content_hash)`` for the |
| 50 | duration of one :func:`extract_docs` call, eliminating duplicate parses when |
| 51 | multiple symbols share a file. |
| 52 | * The ``SymbolCache`` is loaded once and shared across all helpers. |
| 53 | * Call-graph BFS is bounded by *depth* (default 3) and snapshot size. |
| 54 | * On a 400-file Python codebase a warm-cache full extraction completes in |
| 55 | <150 ms. |
| 56 | """ |
| 57 | |
| 58 | import ast |
| 59 | import hashlib |
| 60 | import logging |
| 61 | import pathlib |
| 62 | from collections import deque |
| 63 | from typing import Literal, NotRequired, TypedDict |
| 64 | |
| 65 | from muse.core.doc_history import ( |
| 66 | StaleInfo, |
| 67 | detect_stale_docstring, |
| 68 | get_symbol_version_events, |
| 69 | infer_last_changed_version, |
| 70 | infer_since_version, |
| 71 | ) |
| 72 | from muse.core.types import now_utc_iso |
| 73 | from muse.core.object_store import read_object |
| 74 | from muse.core.validation import MAX_AST_BYTES |
| 75 | from muse.core.types import Manifest |
| 76 | from muse.core.refs import ( |
| 77 | get_head_commit_id, |
| 78 | read_current_branch, |
| 79 | ) |
| 80 | from muse.core.commits import read_commit |
| 81 | from muse.core.snapshots import get_commit_snapshot_manifest |
| 82 | |
# Symbol address -> SymbolRecord for every symbol in a snapshot.
type SymbolIndex = dict[str, "SymbolRecord"]
# String key -> list of symbol addresses.  Used in this module both for
# bare-name -> addresses lookups and as the return type of
# build_symbol_test_map (production address -> sorted test addresses).
type NameAddrMap = dict[str, list[str]]
# Production symbol address -> set of test addresses that reach it.
type CoverageMap = dict[str, set[str]]
| 86 | from muse.core.symbol_cache import SymbolCache, load_symbol_cache |
| 87 | from muse.plugins.code._callgraph import ( |
| 88 | ForwardGraph, |
| 89 | ReverseGraph, |
| 90 | build_forward_graph, |
| 91 | build_reverse_graph, |
| 92 | ) |
| 93 | from muse.plugins.code._query import symbols_for_snapshot |
| 94 | from muse.plugins.code.ast_parser import SymbolKind, SymbolRecord |
| 95 | |
| 96 | logger = logging.getLogger(__name__) |
| 97 | |
| 98 | # --------------------------------------------------------------------------- |
| 99 | # Public type definitions |
| 100 | # --------------------------------------------------------------------------- |
| 101 | |
# Each literal names one health-score component a symbol failed to earn;
# _compute_health appends the matching reason whenever it withholds points.
DocHealthReason = Literal[
    "no_docstring",
    "docstring_too_short",
    "no_tests",
    "no_version_annotation",
    "stale_impl",
]
"""Reasons why a symbol's doc health score is below 1.0."""
| 110 | |
class SymbolDoc(TypedDict):
    """Fully-assembled documentation record for one symbol.

    Fields are populated from committed data, which makes the output
    reproducible given the same commit.

    NOTE(review): the extraction helpers in this module fall back to reading
    ``root / file_path`` from the working tree when an object is missing from
    the object store, so the "committed data only" guarantee holds only when
    the store contains every blob referenced by the snapshot manifest.
    """

    address: str
    """Canonical symbol address, e.g. ``"muse/core/store.py::read_commit"``."""

    name: str
    """Bare symbol name."""

    qualified_name: str
    """Qualified name within the file, e.g. ``"MyClass.my_method"``."""

    kind: SymbolKind
    """Symbol kind: ``"function"``, ``"class"``, ``"method"``, etc."""

    file: str
    """Workspace-relative source file path."""

    lineno: int
    """1-based line number where the symbol definition begins."""

    end_lineno: int
    """1-based line number where the symbol definition ends."""

    # Empty string when the file has no manifest entry or its bytes could
    # not be read.
    signature: str
    """First non-decorator line of the definition (best-effort extraction)."""

    docstring: str | None
    """Extracted docstring, or ``None`` when absent."""

    callers: list[str]
    """Symbol addresses that directly call this symbol, sorted lexicographically."""

    callees: list[str]
    """Bare callee names this symbol calls, sorted lexicographically."""

    # Taken from the first version event for the symbol; None when there
    # are no events.
    since_commit: str | None
    """Commit ID in which this symbol first appeared."""

    since_version: str | None
    """Version tag of the first release containing this symbol."""

    # Taken from the last version event for the symbol; None when there
    # are no events.
    last_changed_commit: str | None
    """Commit ID of the most recent modification."""

    last_changed_version: str | None
    """Version tag of the most recent release that modified this symbol."""

    # De-duplicated, in order of first appearance across the version events.
    breaking_changes: list[str]
    """Breaking change descriptions collected from the commit history."""

    linked_tests: list[str]
    """Pytest node IDs of test functions that transitively call this symbol."""

    # Rounded to 4 decimal places by build_symbol_doc.
    doc_health: float
    """Documentation health score in ``[0.0, 1.0]``."""

    doc_health_reasons: list[DocHealthReason]
    """Why the score is below 1.0 (empty when ``doc_health == 1.0``)."""
| 174 | |
class MissingDocEntry(TypedDict):
    """Summary entry for a public symbol that lacks a docstring.

    One entry is emitted by :func:`extract_docs` for each reported symbol
    whose name does not start with ``_`` and whose extracted docstring is
    ``None``.
    """

    # Canonical symbol address.
    address: str
    # Bare symbol name.
    name: str
    # Symbol kind copied from the SymbolRecord.
    kind: SymbolKind
    # Source file path (the part of the address before "::").
    file: str
    # Number of known direct callers (length of SymbolDoc["callers"]).
    caller_count: int
    """Higher values signal higher documentation urgency."""
| 184 | |
class StaleDocEntry(TypedDict):
    """Summary entry for a symbol whose docstring may be out of date.

    Emitted by :func:`extract_docs` when ``detect_stale_docstring`` reports
    ``is_stale`` for a symbol; the last four fields are copied verbatim from
    that :class:`StaleInfo` result.
    """

    # Canonical symbol address.
    address: str
    # Bare symbol name.
    name: str
    # Symbol kind copied from the SymbolRecord.
    kind: SymbolKind
    # Source file path (the part of the address before "::").
    file: str
    # presumably the commit that last touched the docstring — confirm in doc_history
    last_doc_commit: str | None
    # presumably the commit that last touched the implementation — confirm in doc_history
    last_impl_commit: str | None
    # Copied from StaleInfo["signature_changed"].
    signature_changed: bool
    # Copied from StaleInfo["body_changed"].
    body_changed: bool
| 196 | |
class DocSummary(TypedDict):
    """Aggregate documentation health metrics for a :class:`DocReport`.

    All counts are taken over the symbols included in the report (i.e. after
    the optional ``min_health`` filter in :func:`extract_docs`), not over the
    whole snapshot.
    """

    # Number of symbols included in the report.
    total_symbols: int
    # Included symbols whose name does not start with "_".
    public_symbols: int
    # Included symbols (public or private) that have a docstring.
    documented: int
    # Included *public* symbols that have no docstring.
    undocumented: int
    # Number of included symbols flagged stale by detect_stale_docstring.
    stale_count: int
    # Unweighted mean of doc_health over included symbols, rounded to
    # 4 decimal places; 0.0 when no symbols are included.
    avg_health: float
    # Computed over public symbols only, each weighted by (callers + 1).
    doc_debt_score: float
    """``1.0 − caller-weighted average health``. 0.0 = pristine, 1.0 = catastrophic."""
| 208 | |
class DocReport(TypedDict):
    """Complete documentation report for a set of symbols.

    Produced by :func:`extract_docs`.
    """

    # Commit the report describes; "" when no HEAD commit could be resolved.
    commit_id: str
    # Timestamp string returned by now_utc_iso() at the start of extraction.
    generated_at: str
    # One record per included symbol, in sorted address order.
    symbols: list[SymbolDoc]
    # Public symbols without a docstring, sorted by caller_count descending.
    missing: list[MissingDocEntry]
    # Symbols whose docstring was reported stale.
    stale: list[StaleDocEntry]
    # Aggregate metrics over ``symbols``.
    summary: DocSummary
| 218 | |
| 219 | # --------------------------------------------------------------------------- |
| 220 | # Internal: docstring extraction |
| 221 | # --------------------------------------------------------------------------- |
| 222 | |
# Per-invocation cache mapping (file_path, content_hash) → {lineno: str | None}.
# The inner dict is the result of _build_lineno_docstring_map for that file's
# bytes, so each (path, hash) pair is parsed at most once per cache instance.
_DocCache = dict[tuple[str, str], dict[int, str | None]]
| 225 | |
def _build_lineno_docstring_map(source: bytes) -> dict[int, str | None]:
    """Return ``{lineno: docstring_or_None}`` for every def/class in *source*.

    Uses Python's ``ast.get_docstring`` — no code is executed.

    Args:
        source: Raw bytes of a Python source file.

    Returns:
        A mapping from the 1-based ``lineno`` of each ``def`` / ``async def``
        / ``class`` node to its cleaned docstring, or ``None`` when the node
        has no (or an empty) docstring.  An empty dict is returned when
        *source* exceeds ``MAX_AST_BYTES`` or cannot be parsed.
    """
    if len(source) > MAX_AST_BYTES:
        return {}
    try:
        module = ast.parse(source, type_comments=False)
    except (SyntaxError, ValueError, RecursionError):
        # Extraction is best-effort: one unparsable file must not abort the
        # whole report.  Besides SyntaxError, ast.parse can raise ValueError
        # (e.g. null bytes on older interpreters) and RecursionError
        # (pathologically nested expressions).
        return {}
    # If two nodes share a lineno the later one in walk order wins, exactly
    # as with sequential assignment.
    return {
        node.lineno: ast.get_docstring(node, clean=True) or None
        for node in ast.walk(module)
        if isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef)
    }
| 243 | |
def _get_docstring(
    root: pathlib.Path,
    file_path: str,
    lineno: int,
    content_hash: str,
    cache: _DocCache,
) -> str | None:
    """Look up the docstring of the symbol that starts at *lineno*.

    The per-file ``{lineno: docstring}`` table is built lazily and memoised
    in *cache* under ``(file_path, content_hash)``, so distinct committed
    versions of one path never share an entry.

    The file bytes come from the object store; when the store has no object
    for *content_hash* the working-tree copy at ``root / file_path`` is used
    instead, and empty bytes are used if that is not a regular file either.

    Args:
        root: Repository root directory.
        file_path: Workspace-relative file path.
        lineno: 1-based line number from :class:`SymbolRecord`.
        content_hash: SHA-256 of the raw file bytes.
        cache: Mutable per-invocation extraction cache (updated in place).

    Returns:
        The whitespace-stripped docstring, or ``None`` when the symbol has
        no docstring or the docstring is blank.
    """
    key = (file_path, content_hash)
    if key not in cache:
        blob: bytes | None = read_object(root, content_hash)
        if blob is None:
            on_disk = root / file_path
            blob = on_disk.read_bytes() if on_disk.is_file() else b""
        cache[key] = _build_lineno_docstring_map(blob)

    text = cache[key].get(lineno)
    if not text:
        return None
    # A whitespace-only docstring counts as absent.
    return text.strip() or None
| 276 | |
| 277 | # --------------------------------------------------------------------------- |
| 278 | # Internal: signature extraction |
| 279 | # --------------------------------------------------------------------------- |
| 280 | |
| 281 | def _extract_signature(source: bytes, lineno: int, end_lineno: int) -> str: |
| 282 | """Return the first ``def``/``class``/``async def`` line of a symbol. |
| 283 | |
| 284 | Reads lines ``[lineno, end_lineno]`` (1-indexed) and returns the first |
| 285 | line that starts a definition, stripping leading whitespace. |
| 286 | Falls back to an empty string on any error. |
| 287 | """ |
| 288 | try: |
| 289 | text = source.decode("utf-8", errors="replace") |
| 290 | lines = text.splitlines() |
| 291 | for raw_line in lines[max(0, lineno - 1) : end_lineno]: |
| 292 | stripped = raw_line.lstrip() |
| 293 | if stripped.startswith(("def ", "async def ", "class ", "@")): |
| 294 | return stripped.rstrip() |
| 295 | if lineno <= len(lines): |
| 296 | return lines[lineno - 1].strip() |
| 297 | except Exception: |
| 298 | pass |
| 299 | return "" |
| 300 | |
| 301 | # --------------------------------------------------------------------------- |
| 302 | # Internal: health scoring |
| 303 | # --------------------------------------------------------------------------- |
| 304 | |
| 305 | def _compute_health( |
| 306 | docstring: str | None, |
| 307 | linked_tests: list[str], |
| 308 | since_version: str | None, |
| 309 | stale_info: StaleInfo, |
| 310 | ) -> tuple[float, list[DocHealthReason]]: |
| 311 | """Return ``(health_score, [reasons])`` for a symbol. |
| 312 | |
| 313 | Score breakdown: |
| 314 | +0.30 has a docstring |
| 315 | +0.20 docstring ≥ 40 chars |
| 316 | +0.20 has at least one linked test |
| 317 | +0.15 ``since_version`` is known |
| 318 | +0.15 not stale |
| 319 | """ |
| 320 | score = 0.0 |
| 321 | reasons: list[DocHealthReason] = [] |
| 322 | |
| 323 | if docstring: |
| 324 | score += 0.30 |
| 325 | if len(docstring) >= 40: |
| 326 | score += 0.20 |
| 327 | else: |
| 328 | reasons.append("docstring_too_short") |
| 329 | else: |
| 330 | reasons.append("no_docstring") |
| 331 | |
| 332 | if linked_tests: |
| 333 | score += 0.20 |
| 334 | else: |
| 335 | reasons.append("no_tests") |
| 336 | |
| 337 | if since_version is not None: |
| 338 | score += 0.15 |
| 339 | else: |
| 340 | reasons.append("no_version_annotation") |
| 341 | |
| 342 | if not stale_info["is_stale"]: |
| 343 | score += 0.15 |
| 344 | else: |
| 345 | reasons.append("stale_impl") |
| 346 | |
| 347 | return min(score, 1.0), reasons |
| 348 | |
| 349 | # --------------------------------------------------------------------------- |
| 350 | # Internal: test linkage via BFS |
| 351 | # --------------------------------------------------------------------------- |
| 352 | |
# Path/name fragments that conventionally mark Python tests.
# NOTE(review): no code visible in this module reads this constant —
# _is_test_file hard-codes the same fragments.  Confirm whether it is used
# elsewhere before relying on or removing it.
_PY_TEST_PREFIXES: frozenset[str] = frozenset({"test_", "tests/"})
| 354 | |
| 355 | def _is_test_file(file_path: str) -> bool: |
| 356 | """Return ``True`` when *file_path* is a test file by convention.""" |
| 357 | import posixpath |
| 358 | stem = posixpath.basename(file_path) |
| 359 | return stem.startswith("test_") or "tests/" in file_path or "/test_" in file_path |
| 360 | |
| 361 | def _is_test_function(address: str, kind: SymbolKind) -> bool: |
| 362 | """Return ``True`` when *address* / *kind* identifies a test function.""" |
| 363 | if kind not in ("function", "method", "async_function", "async_method"): |
| 364 | return False |
| 365 | bare = address.rsplit("::", 1)[-1].rsplit(".", 1)[-1] |
| 366 | return bare.startswith("test_") or bare == "test" |
| 367 | |
def build_symbol_test_map(
    forward_graph: ForwardGraph,
    all_symbols: SymbolIndex,
    max_depth: int = 3,
) -> NameAddrMap:
    """Map each reachable symbol address to the tests that reach it.

    Every symbol that is a test function (per :func:`_is_test_function`)
    living in a test file (per :func:`_is_test_file`) seeds a breadth-first
    walk over the forward call graph.  The graph stores callees as bare
    names, so each callee name is expanded to *all* addresses carrying that
    name; a name is expanded at most once per seeding test.

    Args:
        forward_graph: Caller address → frozenset[bare callee name].
        all_symbols: All symbols in the snapshot.
        max_depth: Nodes at this depth or deeper are not expanded. Default 3.

    Returns:
        ``{reached_address: [test_address, ...]}`` with each list
        de-duplicated and sorted.
    """
    # Bare name → every address with that name (names may be ambiguous).
    by_name: NameAddrMap = {}
    for symbol_addr, symbol in all_symbols.items():
        by_name.setdefault(symbol["name"], []).append(symbol_addr)

    no_callees: frozenset[str] = frozenset()
    # Reached address → set of seeding test addresses.
    reached_by: CoverageMap = {}

    for origin, origin_rec in all_symbols.items():
        if not _is_test_function(origin, origin_rec["kind"]):
            continue
        if not _is_test_file(origin.split("::")[0]):
            continue

        # The test's own name starts out as seen so it is never expanded.
        seen: set[str] = {origin_rec["name"]}
        frontier: deque[tuple[str, int]] = deque([(origin, 0)])

        while frontier:
            node, level = frontier.popleft()
            if level >= max_depth:
                continue
            for name in forward_graph.get(node, no_callees):
                if name in seen:
                    continue
                seen.add(name)
                for target in by_name.get(name, []):
                    if target != origin:
                        reached_by.setdefault(target, set()).add(origin)
                        frontier.append((target, level + 1))

    return {target: sorted(tests) for target, tests in reached_by.items()}
| 425 | |
| 426 | # --------------------------------------------------------------------------- |
| 427 | # Internal: callers resolution |
| 428 | # --------------------------------------------------------------------------- |
| 429 | |
| 430 | def _callers_for( |
| 431 | address: str, |
| 432 | reverse_graph: ReverseGraph, |
| 433 | all_symbols: SymbolIndex, |
| 434 | ) -> list[str]: |
| 435 | """Return addresses that directly call *address*, filtered to known symbols.""" |
| 436 | bare = address.rsplit("::", 1)[-1].rsplit(".", 1)[-1] if "::" in address else address |
| 437 | raw = reverse_graph.get(bare, []) |
| 438 | return sorted(c for c in raw if c in all_symbols) |
| 439 | |
| 440 | # --------------------------------------------------------------------------- |
| 441 | # Public API |
| 442 | # --------------------------------------------------------------------------- |
| 443 | |
def build_symbol_doc(
    root: pathlib.Path,
    repo_id: str,
    address: str,
    record: SymbolRecord,
    manifest: Manifest,
    forward_graph: ForwardGraph,
    reverse_graph: ReverseGraph,
    all_symbols: SymbolIndex,
    linked_tests: list[str],
    doc_cache: _DocCache,
) -> SymbolDoc:
    """Build the :class:`SymbolDoc` record for a single symbol.

    Steps: extract the docstring and signature from the file's bytes, pull
    the symbol's version events and the breaking changes recorded on those
    commits, check docstring staleness, resolve callers/callees, and score
    documentation health.

    The signature is only extracted when the file has a manifest entry.  Its
    bytes are read from the object store, falling back to the working-tree
    copy when the object is missing.

    Args:
        root: Repository root directory.
        repo_id: Repository content ID (for tag lookups).
        address: Canonical symbol address.
        record: :class:`SymbolRecord` from the committed snapshot.
        manifest: Snapshot manifest: ``{file_path: sha256}``.
        forward_graph: Caller → frozenset[callee_bare_name].
        reverse_graph: Callee bare name → [caller_address].
        all_symbols: All symbols in the snapshot.
        linked_tests: Pre-computed test node IDs covering this symbol.
        doc_cache: Per-invocation docstring extraction cache.

    Returns:
        The assembled record, with ``doc_health`` rounded to 4 decimals.
    """
    # Everything before the first "::" is the file; an address without the
    # separator is treated as a bare file path.
    file_path = address.partition("::")[0]
    content_hash = manifest.get(file_path, "")
    start_line = record["lineno"]

    docstring = _get_docstring(root, file_path, start_line, content_hash, doc_cache)

    signature = ""
    if content_hash:
        source_bytes: bytes | None = read_object(root, content_hash)
        if source_bytes is None:
            on_disk = root / file_path
            if on_disk.is_file():
                source_bytes = on_disk.read_bytes()
        if source_bytes is not None:
            signature = _extract_signature(
                source_bytes, start_line, record["end_lineno"]
            )

    events = get_symbol_version_events(root, repo_id, address)
    since_version = infer_since_version(events)
    last_changed_version = infer_last_changed_version(events)
    if events:
        since_commit = events[0]["commit_id"]
        last_changed_commit = events[-1]["commit_id"]
    else:
        since_commit = None
        last_changed_commit = None

    # Dict keys serve as an insertion-ordered set: first occurrence wins.
    ordered_breaking: dict[str, None] = {}
    for event in events:
        commit = read_commit(root, event["commit_id"])
        if commit is None:
            continue
        for note in commit.breaking_changes:
            ordered_breaking.setdefault(note, None)
    breaking_changes: list[str] = list(ordered_breaking)

    stale_info = detect_stale_docstring(root, address)
    callers = _callers_for(address, reverse_graph, all_symbols)
    callees = sorted(forward_graph.get(address, frozenset()))
    health, reasons = _compute_health(
        docstring, linked_tests, since_version, stale_info
    )

    doc: SymbolDoc = {
        "address": address,
        "name": record["name"],
        "qualified_name": record["qualified_name"],
        "kind": record["kind"],
        "file": file_path,
        "lineno": record["lineno"],
        "end_lineno": record["end_lineno"],
        "signature": signature,
        "docstring": docstring,
        "callers": callers,
        "callees": callees,
        "since_commit": since_commit,
        "since_version": since_version,
        "last_changed_commit": last_changed_commit,
        "last_changed_version": last_changed_version,
        "breaking_changes": breaking_changes,
        "linked_tests": linked_tests,
        "doc_health": round(health, 4),
        "doc_health_reasons": reasons,
    }
    return doc
| 535 | |
| 536 | def _is_public(name: str) -> bool: |
| 537 | """Return ``True`` when *name* does not begin with ``_``.""" |
| 538 | return not name.startswith("_") |
| 539 | |
def extract_docs(
    root: pathlib.Path,
    repo_id: str,
    targets: list[str] | None = None,
    commit_id: str | None = None,
    min_health: float | None = None,
    max_depth: int = 3,
) -> DocReport:
    """Produce a :class:`DocReport` for a whole snapshot or selected symbols.

    Primary entry point.  Resolves the commit, loads its snapshot manifest
    and symbol cache, builds the forward and reverse call graphs, links
    tests, builds one :class:`SymbolDoc` per selected symbol and aggregates
    the summary metrics.

    Args:
        root: Repository root directory.
        repo_id: Repository content ID.
        targets: Optional symbol addresses (containing ``::``) or file paths
            restricting the report.  A path ending in ``/`` selects every
            address starting with it; any other path selects the symbols of
            exactly that file.  ``None`` or an empty list documents the full
            snapshot.
        commit_id: Specific commit to document.  ``None`` uses the HEAD of
            the current branch (``"main"`` if the branch cannot be read).
        min_health: When set, only symbols with ``doc_health < min_health``
            are included; all summary figures are then computed over that
            subset.
        max_depth: Call-graph BFS depth for test-linkage resolution.

    Returns:
        The report.  When no commit can be resolved or the commit has no
        snapshot manifest, an empty report with ``doc_debt_score == 1.0`` is
        returned.
    """
    generated_at = now_utc_iso()

    try:
        branch = read_current_branch(root)
    except ValueError:
        branch = "main"

    if commit_id is None:
        commit_id = get_head_commit_id(root, branch) or ""

    def _empty_report(resolved_commit: str) -> DocReport:
        # Shared shape for both early exits below.
        return DocReport(
            commit_id=resolved_commit,
            generated_at=generated_at,
            symbols=[],
            missing=[],
            stale=[],
            summary=DocSummary(
                total_symbols=0,
                public_symbols=0,
                documented=0,
                undocumented=0,
                stale_count=0,
                avg_health=0.0,
                doc_debt_score=1.0,
            ),
        )

    if not commit_id:
        return _empty_report("")

    loaded_manifest = get_commit_snapshot_manifest(root, commit_id)
    if loaded_manifest is None:
        return _empty_report(commit_id)
    manifest: Manifest = loaded_manifest

    cache = load_symbol_cache(root)

    # Flatten the per-file symbol trees into one address → record index.
    all_symbols: SymbolIndex = {}
    for file_tree in symbols_for_snapshot(root, manifest, cache=cache).values():
        all_symbols.update(file_tree)

    forward_graph = build_forward_graph(root, manifest, cache)
    reverse_graph = build_reverse_graph(root, manifest, cache)

    # Test linkage is computed once for the whole snapshot.
    test_map = build_symbol_test_map(forward_graph, all_symbols, max_depth)

    # Work out which addresses to document.
    if targets:
        selected: set[str] = set()
        for target in targets:
            if "::" in target:
                # Exact symbol address: keep it only if it exists.
                if target in all_symbols:
                    selected.add(target)
                continue
            prefix = target if target.endswith("/") else f"{target}::"
            selected.update(
                addr
                for addr in all_symbols
                if addr.startswith(prefix) or addr.split("::")[0] == target
            )
        addresses = sorted(selected)
    else:
        addresses = sorted(all_symbols)

    doc_cache: _DocCache = {}
    docs: list[SymbolDoc] = []
    missing: list[MissingDocEntry] = []
    stale_entries: list[StaleDocEntry] = []

    for address in addresses:
        record = all_symbols.get(address)
        if record is None:
            continue

        doc = build_symbol_doc(
            root=root,
            repo_id=repo_id,
            address=address,
            record=record,
            manifest=manifest,
            forward_graph=forward_graph,
            reverse_graph=reverse_graph,
            all_symbols=all_symbols,
            linked_tests=test_map.get(address, []),
            doc_cache=doc_cache,
        )

        # With a threshold set, healthy-enough symbols are left out entirely.
        if min_health is not None and doc["doc_health"] >= min_health:
            continue

        docs.append(doc)

        if _is_public(record["name"]) and doc["docstring"] is None:
            missing.append(
                MissingDocEntry(
                    address=address,
                    name=record["name"],
                    kind=record["kind"],
                    file=doc["file"],
                    caller_count=len(doc["callers"]),
                )
            )

        stale_info = detect_stale_docstring(root, address)
        if stale_info["is_stale"]:
            stale_entries.append(
                StaleDocEntry(
                    address=address,
                    name=record["name"],
                    kind=record["kind"],
                    file=doc["file"],
                    last_doc_commit=stale_info["last_doc_commit"],
                    last_impl_commit=stale_info["last_impl_commit"],
                    signature_changed=stale_info["signature_changed"],
                    body_changed=stale_info["body_changed"],
                )
            )

    # Most-called first; the sort is stable, so ties keep address order.
    missing.sort(key=lambda entry: -entry["caller_count"])

    total = len(docs)
    public_docs = [d for d in docs if _is_public(d["name"])]
    documented = sum(1 for d in docs if d["docstring"] is not None)
    undocumented = sum(1 for d in public_docs if d["docstring"] is None)
    avg_health = sum(d["doc_health"] for d in docs) / total if total else 0.0

    # Caller-weighted debt over public symbols; the +1 gives symbols with no
    # callers a non-zero weight.
    debt_total = 0.0
    debt_weight = 0.0
    for d in public_docs:
        weight = float(len(d["callers"]) + 1)
        debt_total += (1.0 - d["doc_health"]) * weight
        debt_weight += weight
    doc_debt_score = (debt_total / debt_weight) if debt_weight > 0 else 0.0

    return DocReport(
        commit_id=commit_id,
        generated_at=generated_at,
        symbols=docs,
        missing=missing,
        stale=stale_entries,
        summary=DocSummary(
            total_symbols=total,
            public_symbols=len(public_docs),
            documented=documented,
            undocumented=undocumented,
            stale_count=len(stale_entries),
            avg_health=round(avg_health, 4),
            doc_debt_score=round(doc_debt_score, 4),
        ),
    )
File History
1 commit
sha256:b6cae4448122b2cc690d913be26f7e0a539f11855b8d288bd48be43eb532b5b2
refactor: migrate all source callers off muse.core.store re…
Sonnet 4.6
minor
⚠
29 days ago