"""Unit and integration tests for ``muse.core.doc_extractor``. Coverage: - :func:`_build_lineno_docstring_map` with valid and invalid Python source. - :func:`_get_docstring` with object-store hits, file fallback, and caching. - :func:`_extract_signature` with functions, classes, and edge cases. - :func:`_compute_health` for each health dimension. - :func:`build_symbol_test_map` BFS logic. - :func:`extract_docs` integration with a synthetic repository. - :func:`_is_public` naming convention. - DocSummary aggregation (avg_health, debt_score, counts). """ from __future__ import annotations import ast import datetime import hashlib import pathlib import pytest from muse.core.doc_extractor import ( DocHealthReason, DocReport, DocSummary, MissingDocEntry, StaleDocEntry, SymbolDoc, _build_lineno_docstring_map, _compute_health, _extract_signature, _get_docstring, _is_public, build_symbol_test_map, extract_docs, ) from muse.core.doc_history import StaleInfo from muse.core.object_store import write_object from muse.core.ids import hash_commit as compute_commit_id, hash_snapshot as compute_snapshot_id from muse.core.commits import ( CommitRecord, write_commit, ) from muse.core.snapshots import ( SnapshotRecord, write_snapshot, ) from muse.plugins.code._callgraph import ForwardGraph, ReverseGraph from muse.plugins.code.ast_parser import SymbolKind, SymbolRecord from muse.core.types import Manifest, NULL_LONG_ID, blob_id, fake_id, long_id from muse.core.paths import heads_dir, muse_dir # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _make_repo(tmp_path: pathlib.Path) -> pathlib.Path: dot_muse = muse_dir(tmp_path) dot_muse.mkdir() (dot_muse / "repo.json").write_text('{"repo_id": "test-repo-123", "name": "test"}') refs = dot_muse / "refs" / "heads" refs.mkdir(parents=True) (dot_muse / "HEAD").write_text("ref: refs/heads/main\n") return tmp_path def _write_commit_with_snapshot( root: pathlib.Path, manifest: Manifest, ) -> str: snap_id = compute_snapshot_id(manifest) snap = SnapshotRecord(snapshot_id=snap_id, manifest=manifest) write_snapshot(root, snap) committed_at = datetime.datetime(2026, 1, 1, tzinfo=datetime.timezone.utc) commit_id = compute_commit_id( parent_ids=[], snapshot_id=snap_id, message="init", committed_at_iso=committed_at.isoformat(), author="test", ) commit = CommitRecord( commit_id=commit_id, branch="main", snapshot_id=snap_id, message="init", committed_at=committed_at, author="test", ) write_commit(root, commit) (heads_dir(root) / "main").write_text(commit_id) return commit_id def _make_sym( name: str, lineno: int = 1, end_lineno: int = 5, kind: SymbolKind = "function", ) -> SymbolRecord: return SymbolRecord( kind=kind, name=name, qualified_name=name, content_id=fake_id(name), body_hash=blob_id(b"body"), signature_id=blob_id(b"sig"), metadata_id="", canonical_key=f"f.py##{kind}#{name}#{lineno}", lineno=lineno, end_lineno=end_lineno, ) def _stale(is_stale: bool = False) -> StaleInfo: return StaleInfo( is_stale=is_stale, last_doc_commit=None, last_impl_commit=None, signature_changed=False, body_changed=False, ) # --------------------------------------------------------------------------- # Tests: _build_lineno_docstring_map # --------------------------------------------------------------------------- class TestBuildLinnoDocstringMap: def test_function_with_docstring(self) -> None: src = b'def foo():\n """My docstring."""\n pass\n' m = _build_lineno_docstring_map(src) assert m.get(1) == "My docstring." def test_function_without_docstring(self) -> None: src = b"def foo():\n pass\n" m = _build_lineno_docstring_map(src) assert m.get(1) is None def test_class_with_docstring(self) -> None: src = b'class Foo:\n """Class doc."""\n pass\n' m = _build_lineno_docstring_map(src) assert m.get(1) == "Class doc." def test_nested_method_lineno(self) -> None: src = ( b"class Foo:\n" b" def bar(self):\n" b' """Bar doc."""\n' b" pass\n" ) m = _build_lineno_docstring_map(src) assert m.get(2) == "Bar doc." def test_syntax_error_returns_empty(self) -> None: src = b"def foo(:\n pass\n" m = _build_lineno_docstring_map(src) assert m == {} def test_multiline_docstring(self) -> None: src = ( b'def foo():\n' b' """First line.\n' b'\n' b' Second paragraph.\n' b' """\n' b' pass\n' ) m = _build_lineno_docstring_map(src) doc = m.get(1) assert doc is not None assert "First line" in doc assert "Second paragraph" in doc # --------------------------------------------------------------------------- # Tests: _get_docstring # --------------------------------------------------------------------------- class TestGetDocstring: def test_from_object_store(self, tmp_path: pathlib.Path) -> None: root = _make_repo(tmp_path) src = b'def foo():\n """Object store doc."""\n pass\n' content_hash = blob_id(src) write_object(root, content_hash, src) cache: dict[tuple[str, str], dict[int, str | None]] = {} result = _get_docstring(root, "foo.py", 1, content_hash, cache) assert result == "Object store doc." # Cache should be populated. assert ("foo.py", content_hash) in cache def test_from_file_fallback(self, tmp_path: pathlib.Path) -> None: root = _make_repo(tmp_path) src = b'def bar():\n """File fallback doc."""\n pass\n' (tmp_path / "bar.py").write_bytes(src) # Use a fake hash so object store misses. fake_hash = NULL_LONG_ID cache: dict[tuple[str, str], dict[int, str | None]] = {} result = _get_docstring(root, "bar.py", 1, fake_hash, cache) assert result == "File fallback doc." def test_no_docstring_returns_none(self, tmp_path: pathlib.Path) -> None: root = _make_repo(tmp_path) src = b"def baz():\n pass\n" h = blob_id(src) write_object(root, h, src) cache: dict[tuple[str, str], dict[int, str | None]] = {} result = _get_docstring(root, "baz.py", 1, h, cache) assert result is None def test_cache_avoids_reparse(self, tmp_path: pathlib.Path) -> None: """Once a (file, hash) is in cache, subsequent calls return the cached result.""" root = _make_repo(tmp_path) src = b'def fn():\n """Cached doc."""\n pass\n' h = blob_id(src) write_object(root, h, src) cache: dict[tuple[str, str], dict[int, str | None]] = {} # First call — populates cache. result1 = _get_docstring(root, "fn.py", 1, h, cache) assert result1 == "Cached doc." # Manually corrupt the cached map to verify cache is used on second call. cache[("fn.py", h)][1] = "INJECTED" result2 = _get_docstring(root, "fn.py", 1, h, cache) assert result2 == "INJECTED" # cache hit — object store not re-read # --------------------------------------------------------------------------- # Tests: _extract_signature # --------------------------------------------------------------------------- class TestExtractSignature: def test_function(self) -> None: src = b"def my_func(x: int) -> str:\n return str(x)\n" sig = _extract_signature(src, 1, 2) assert "def my_func" in sig def test_class(self) -> None: src = b"class MyClass(Base):\n pass\n" sig = _extract_signature(src, 1, 2) assert "class MyClass" in sig def test_async_function(self) -> None: src = b"async def fetch():\n pass\n" sig = _extract_signature(src, 1, 2) assert "async def fetch" in sig def test_decorator_skipped(self) -> None: src = b"@property\ndef value(self) -> int:\n return 0\n" sig = _extract_signature(src, 1, 3) # The first line is a decorator — should still return something. assert sig # non-empty def test_out_of_range_fallback(self) -> None: src = b"x = 1\n" sig = _extract_signature(src, 100, 110) assert sig == "" # --------------------------------------------------------------------------- # Tests: _compute_health # --------------------------------------------------------------------------- class TestComputeHealth: def test_all_zero(self) -> None: # No docstring = 0, no tests = 0, no version = 0, not stale = +0.15 score, reasons = _compute_health(None, [], None, _stale(False)) assert score == pytest.approx(0.15) assert "no_docstring" in reasons assert "no_tests" in reasons assert "no_version_annotation" in reasons assert "stale_impl" not in reasons def test_stale_penalty(self) -> None: score, reasons = _compute_health(None, [], None, _stale(True)) assert score == pytest.approx(0.0) assert "stale_impl" in reasons def test_full_score(self) -> None: long_doc = "A" * 50 score, reasons = _compute_health(long_doc, ["test1"], "v1.0", _stale(False)) assert score == pytest.approx(1.0) assert reasons == [] def test_short_docstring_penalty(self) -> None: short_doc = "Short." # < 40 chars score, reasons = _compute_health(short_doc, ["t1"], "v1.0", _stale(False)) # has doc: 0.30, short: no +0.20, has test: 0.20, has version: 0.15, not stale: 0.15 assert score == pytest.approx(0.80) assert "docstring_too_short" in reasons def test_capped_at_one(self) -> None: long_doc = "A" * 100 score, _ = _compute_health(long_doc, ["t1", "t2"], "v1.0", _stale(False)) assert score <= 1.0 def test_no_tests(self) -> None: long_doc = "A" * 50 score, reasons = _compute_health(long_doc, [], "v1.0", _stale(False)) # 0.30 + 0.20 (long) + 0 (no tests) + 0.15 (version) + 0.15 (not stale) = 0.80 assert score == pytest.approx(0.80) assert "no_tests" in reasons # --------------------------------------------------------------------------- # Tests: build_symbol_test_map # --------------------------------------------------------------------------- class TestBuildSymbolTestMap: def test_empty_symbols(self) -> None: result = build_symbol_test_map({}, {}) assert result == {} def test_test_not_linked_to_non_test(self) -> None: """Test functions should not appear as callers of themselves.""" sym: SymbolRecord = _make_sym("test_foo", kind="function") all_syms = {"tests/test_a.py::test_foo": sym} fg: ForwardGraph = {"tests/test_a.py::test_foo": frozenset({"bar"})} result = build_symbol_test_map(fg, all_syms) # "bar" is in callees but has no SymbolRecord — map should be empty. assert result == {} def test_single_test_links_to_production(self) -> None: test_sym: SymbolRecord = _make_sym("test_foo", kind="function") prod_sym: SymbolRecord = _make_sym("bar", kind="function") all_syms = { "tests/test_a.py::test_foo": test_sym, "muse/core/a.py::bar": prod_sym, } fg: ForwardGraph = { "tests/test_a.py::test_foo": frozenset({"bar"}), "muse/core/a.py::bar": frozenset(), } result = build_symbol_test_map(fg, all_syms) assert "muse/core/a.py::bar" in result assert "tests/test_a.py::test_foo" in result["muse/core/a.py::bar"] def test_depth_limit_respected(self) -> None: """BFS stops at max_depth hops.""" all_syms = { "tests/t.py::test_x": _make_sym("test_x", kind="function"), "a.py::a": _make_sym("a", kind="function"), "b.py::b": _make_sym("b", kind="function"), "c.py::c": _make_sym("c", kind="function"), "d.py::d": _make_sym("d", kind="function"), } fg: ForwardGraph = { "tests/t.py::test_x": frozenset({"a"}), "a.py::a": frozenset({"b"}), "b.py::b": frozenset({"c"}), "c.py::c": frozenset({"d"}), } # max_depth=2 → test_x → a (depth 1) → b (depth 2), stop. result = build_symbol_test_map(fg, all_syms, max_depth=2) assert "a.py::a" in result assert "b.py::b" in result assert "c.py::c" not in result assert "d.py::d" not in result def test_no_infinite_loop(self) -> None: """Cyclic call graph does not cause infinite loop.""" all_syms = { "tests/t.py::test_cycle": _make_sym("test_cycle", kind="function"), "a.py::alpha": _make_sym("alpha", kind="function"), "b.py::beta": _make_sym("beta", kind="function"), } fg: ForwardGraph = { "tests/t.py::test_cycle": frozenset({"alpha"}), "a.py::alpha": frozenset({"beta"}), "b.py::beta": frozenset({"alpha"}), # cycle } result = build_symbol_test_map(fg, all_syms) # Should complete without recursion limit. assert isinstance(result, dict) # --------------------------------------------------------------------------- # Tests: _is_public # --------------------------------------------------------------------------- class TestIsPublic: def test_public_name(self) -> None: assert _is_public("my_function") is True def test_private_name(self) -> None: assert _is_public("_private") is False def test_dunder(self) -> None: assert _is_public("__init__") is False def test_empty_string(self) -> None: assert _is_public("") is True # --------------------------------------------------------------------------- # Tests: extract_docs integration # --------------------------------------------------------------------------- class TestExtractDocs: def test_empty_repo_no_commit(self, tmp_path: pathlib.Path) -> None: """When there's no HEAD commit, returns an empty report.""" root = _make_repo(tmp_path) report = extract_docs(root, "test-repo-123") assert report["commit_id"] == "" assert report["symbols"] == [] assert report["summary"]["total_symbols"] == 0 def test_repo_with_python_file(self, tmp_path: pathlib.Path) -> None: """A repo with one documented Python file produces at least one SymbolDoc.""" root = _make_repo(tmp_path) src = ( b"def documented_fn(x: int) -> str:\n" b' """Return x as a string."""\n' b" return str(x)\n" ) content_hash = blob_id(src) write_object(root, content_hash, src) (tmp_path / "documented.py").write_bytes(src) manifest = {"documented.py": content_hash} _write_commit_with_snapshot(root, manifest) report = extract_docs(root, "test-repo-123") assert report["summary"]["total_symbols"] >= 1 addrs = [d["address"] for d in report["symbols"]] assert any("documented_fn" in a for a in addrs) def test_missing_list_populated(self, tmp_path: pathlib.Path) -> None: """Public functions without docstrings appear in 'missing'.""" root = _make_repo(tmp_path) src = b"def undocumented() -> None:\n pass\n" h = blob_id(src) write_object(root, h, src) (tmp_path / "nodoc.py").write_bytes(src) manifest = {"nodoc.py": h} _write_commit_with_snapshot(root, manifest) report = extract_docs(root, "test-repo-123") missing_addrs = [m["address"] for m in report["missing"]] assert any("undocumented" in a for a in missing_addrs) def test_targets_filter(self, tmp_path: pathlib.Path) -> None: """When targets is set, only those symbols appear in the report.""" root = _make_repo(tmp_path) src = ( b"def alpha() -> None:\n pass\n" b"def beta() -> None:\n pass\n" ) h = blob_id(src) write_object(root, h, src) (tmp_path / "ab.py").write_bytes(src) manifest = {"ab.py": h} _write_commit_with_snapshot(root, manifest) # Find the alpha address. full_report = extract_docs(root, "test-repo-123") alpha_addrs = [ d["address"] for d in full_report["symbols"] if "alpha" in d["address"] ] if not alpha_addrs: pytest.skip("alpha not found in snapshot — symbol cache not populated") targeted = extract_docs(root, "test-repo-123", targets=[alpha_addrs[0]]) addrs = [d["address"] for d in targeted["symbols"]] assert any("alpha" in a for a in addrs) assert not any("beta" in a for a in addrs) def test_summary_aggregation(self, tmp_path: pathlib.Path) -> None: """DocSummary counts are consistent with the symbols list.""" root = _make_repo(tmp_path) src = ( b"def with_doc():\n" b' """Has doc."""\n' b" pass\n" b"def without_doc():\n" b" pass\n" ) h = blob_id(src) write_object(root, h, src) (tmp_path / "mixed.py").write_bytes(src) manifest = {"mixed.py": h} _write_commit_with_snapshot(root, manifest) report = extract_docs(root, "test-repo-123") s = report["summary"] assert s["total_symbols"] == len(report["symbols"]) assert s["documented"] + s["undocumented"] <= s["total_symbols"] assert 0.0 <= s["avg_health"] <= 1.0 assert 0.0 <= s["doc_debt_score"] <= 1.0 def test_at_commit_param(self, tmp_path: pathlib.Path) -> None: """Passing commit_id uses that commit rather than HEAD.""" root = _make_repo(tmp_path) src = b"def fn():\n pass\n" h = blob_id(src) write_object(root, h, src) (tmp_path / "fn.py").write_bytes(src) manifest = {"fn.py": h} cid = _write_commit_with_snapshot(root, manifest) report = extract_docs(root, "test-repo-123", commit_id=cid) assert report["commit_id"] == cid def test_invalid_commit_returns_empty(self, tmp_path: pathlib.Path) -> None: """An unknown commit_id returns an empty report, not an error.""" root = _make_repo(tmp_path) report = extract_docs(root, "test-repo-123", commit_id=NULL_LONG_ID) assert report["symbols"] == [] # --------------------------------------------------------------------------- # Stress tests # --------------------------------------------------------------------------- class TestExtractDocsStress: def test_many_symbols(self, tmp_path: pathlib.Path) -> None: """extract_docs handles a file with 100 functions without crashing.""" root = _make_repo(tmp_path) lines: list[str] = [] for i in range(100): lines.append(f'def fn_{i}(x: int) -> int:') lines.append(f' """Function {i} — does something useful."""') lines.append(f' return x + {i}') lines.append("") src = "\n".join(lines).encode() h = blob_id(src) write_object(root, h, src) (tmp_path / "big.py").write_bytes(src) manifest = {"big.py": h} _write_commit_with_snapshot(root, manifest) report = extract_docs(root, "test-repo-123") assert report["summary"]["total_symbols"] >= 50 # at least most parsed