"""Symbol-graph–driven test selection for ``muse code test``.

Given a set of changed symbol addresses (taken from ``muse diff`` or the
working-tree diff), this module identifies the minimal set of test functions
that exercise those symbols — without running a single test.

How it works
------------
1. Build the **forward call graph** for the committed snapshot (caller → callees).
2. For every *test function* in the graph, perform a BFS through the forward
   graph up to *depth* hops, accumulating the set of production symbols it
   transitively calls.
3. Invert the mapping: production symbol → list[test_node_ids].
4. For each changed symbol, look up which tests cover it.  Tests that cover
   *any* changed symbol are included in the result.

Why this is better than file-name heuristics
---------------------------------------------
File-name heuristics (``test_foo.py`` ↔ ``foo.py``) break the moment a
repository uses a non-obvious naming scheme, a shared test module, or a
parameterized fixture that exercises symbols from many files.  The call graph
knows exactly what each test calls — it does not guess.

Security
--------
This module is purely **read-only and static**.  It reads committed objects
from the content-addressed object store (SHA-256 verified blobs) and parses
them with Python's built-in ``ast`` module, which never executes code.
No working-tree file is written.  No subprocess is spawned.

Performance
-----------
Blobs are read once and cached by the ``SymbolCache``.  The call-graph BFS
is bounded by *depth* (default 3) and the size of the snapshot.  On a 400-
file Python codebase a warm-cache run completes in <50 ms.
"""

import logging
import pathlib
from collections.abc import Iterable
from typing import Literal, TypedDict

from muse.core.callgraph_cache import CallGraphCache, load_callgraph_cache
from muse.core.paths import muse_dir as _muse_dir
from muse.core.types import Manifest
from muse.core.symbol_cache import SymbolCache, load_symbol_cache

type SymbolIndex = dict[str, SymbolRecord]
type CoverageMap = dict[str, list[tuple[str, int]]]
type CounterMap = dict[str, int]
type TestListMap = dict[str, list[str]]
from muse.plugins.code._callgraph import (
    ForwardGraph,
    build_forward_graph,
)
from muse.plugins.code._query import is_semantic, symbols_for_snapshot
from muse.plugins.code.ast_parser import SymbolRecord

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Public type definitions
# ---------------------------------------------------------------------------

class ChangedSymbol(TypedDict):
    """A symbol that changed between two snapshots or in the working tree."""

    address: str
    """Fully-qualified symbol address: ``"path/to/file.py::FunctionName"``."""

    change_kind: Literal["modified", "added", "deleted"]
    """Whether the symbol body was modified, newly added, or removed."""

class SelectionTarget(TypedDict):
    """A single pytest-addressable test target to execute."""

    node_id: str
    """Pytest node ID, e.g. ``"tests/test_foo.py::TestBar::test_baz"``."""

    file: str
    """The test file path, e.g. ``"tests/test_foo.py"``."""

    reason: str
    """Human-readable explanation of why this test was selected."""

    confidence: float
    """Selection confidence in [0.0, 1.0]:

    * 1.0 — test directly calls the changed symbol.
    * 0.9 — test reaches changed symbol within depth ≤ 2.
    * 0.7 — test reaches changed symbol via depth 3+ hops.
    * 0.5 — test is in a file whose name matches the changed file's stem
              (file-name heuristic, fallback only).
    """

class SelectionResult(TypedDict):
    """Result of a test-selection pass."""

    changed_addresses: list[str]
    """Addresses of every changed symbol that was considered."""

    test_targets: list[SelectionTarget]
    """Ordered list of tests to run (deduplicated, highest confidence first)."""

    covered_addresses: list[str]
    """Subset of *changed_addresses* that have at least one covering test."""

    uncovered_addresses: list[str]
    """Changed symbols with no covering test — coverage gap alert."""

    coverage_fraction: float
    """``len(covered_addresses) / len(changed_addresses)`` in [0.0, 1.0]."""

    fallback_used: bool
    """True if file-name heuristics were used for any target (graph miss)."""

# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _is_test_file(path: str) -> bool:
    """Return True if *path* is a test file by convention."""
    stem = pathlib.PurePosixPath(path).stem
    name = pathlib.PurePosixPath(path).name
    return (
        stem.startswith("test_")
        or stem.endswith("_test")
        or name == "conftest.py"
    )

def _is_test_function(address: str, kind: str) -> bool:
    """Return True if *address* refers to a test function or method."""
    parts = address.rsplit("::", 1)
    if len(parts) != 2:
        return False
    name = parts[1]
    return (
        kind in {"function", "method", "async_function", "async_method"}
        and (name.startswith("test_") or name == "conftest")
    )

def _confidence(depth: int) -> float:
    """Map BFS depth to a confidence score."""
    if depth <= 1:
        return 1.0
    if depth <= 2:
        return 0.9
    return 0.7

# ---------------------------------------------------------------------------
# Core selection algorithm
# ---------------------------------------------------------------------------

def _build_coverage_map(
    forward_graph: ForwardGraph,
    all_symbols: SymbolIndex,
    max_depth: int,
) -> CoverageMap:
    """Return mapping from production-symbol bare name → list[(test_node_id, depth)].

    Each value is a list of ``(test_address, bfs_depth)`` pairs.  The BFS
    starts from every test function and follows the *forward* call graph
    (caller → callees) up to *max_depth* hops.  At each hop we accumulate
    the production symbols reached.

    ``all_symbols`` is a flat mapping of ``address → SymbolRecord``
    used to look up the ``kind`` field.
    """
    coverage: CoverageMap = {}

    for addr, rec in all_symbols.items():
        kind = rec["kind"]
        if not _is_test_function(addr, kind):
            continue
        file_part = addr.split("::")[0]
        if not _is_test_file(file_part):
            continue

        # BFS from this test function through the forward call graph.
        bare_start = addr.rsplit("::", 1)[-1]
        frontier: list[tuple[str, int]] = [(bare_start, 0)]
        visited: set[str] = {bare_start}

        while frontier:
            current, depth = frontier.pop(0)
            if depth >= max_depth:
                continue
            for callee in forward_graph.get(current, frozenset()):
                if callee in visited:
                    continue
                visited.add(callee)
                reach_depth = depth + 1
                bucket = coverage.setdefault(callee, [])
                bucket.append((addr, reach_depth))
                frontier.append((callee, reach_depth))

    return coverage

def select_tests(
    root: pathlib.Path,
    changed: Iterable[ChangedSymbol],
    manifest: Manifest,
    *,
    depth: int = 3,
    cache: SymbolCache | None = None,
    callgraph_cache: CallGraphCache | None = None,
) -> SelectionResult:
    """Select the minimal test set that covers *changed* symbols.

    Args:
        root:              Repository root (locates the object store and caches).
        changed:           Iterable of :class:`ChangedSymbol` from ``muse diff``.
        manifest:          Snapshot manifest mapping ``file_path → sha256``.  Pass the
                           HEAD manifest to analyse the committed graph; pass the
                           working-tree manifest to include uncommitted edits.
        depth:             Maximum call-graph hops from a test function to a production
                           symbol.  Higher values yield more coverage but are slower.
                           Default 3.  Capped at 10 internally to bound BFS cost.
        cache:             Optional pre-loaded :class:`SymbolCache`.  When ``None`` the
                           cache is loaded from disk and saved on return.
        callgraph_cache:   Optional pre-loaded :class:`CallGraphCache`.  When ``None``
                           the cache is loaded from disk and saved after the graph is
                           built.  On a warm cache, ``build_forward_graph`` skips every
                           ``read_object`` / ``ast.parse`` / AST-walk — the primary
                           speedup lever.

    Returns:
        A :class:`SelectionResult` with deduplicated, confidence-sorted test
        targets and a coverage gap report.
    """
    changed_list = list(changed)
    changed_addresses = [c["address"] for c in changed_list]

    if not changed_addresses:
        return SelectionResult(
            changed_addresses=[],
            test_targets=[],
            covered_addresses=[],
            uncovered_addresses=[],
            coverage_fraction=1.0,
            fallback_used=False,
        )

    effective_depth = min(depth, 10)

    own_cache = cache is None
    active_cache: SymbolCache = cache if cache is not None else load_symbol_cache(root)

    own_cg_cache = callgraph_cache is None
    muse_dir = _muse_dir(root)
    active_cg_cache: CallGraphCache = (
        callgraph_cache if callgraph_cache is not None else load_callgraph_cache(root)
    )

    # --- Build the full symbol map (all files) ----------------------------
    all_trees = symbols_for_snapshot(root, manifest, cache=active_cache)

    # Flatten to address → SymbolRecord for kind lookups.
    flat_symbols: SymbolIndex = {}
    for _file_path, tree in all_trees.items():
        for addr, rec in tree.items():
            flat_symbols[addr] = rec

    # --- Build call graph -------------------------------------------------
    # The forward graph is keyed by *bare function name* because call-site
    # analysis via AST Name/Attribute nodes only sees the local name.
    # Passing callgraph_cache enables the fast path: warm-cache files skip
    # read_object + ast.parse + AST walk entirely.
    forward_graph = build_forward_graph(
        root, manifest, cache=active_cache, callgraph_cache=active_cg_cache
    )

    if own_cache:
        active_cache.save()
    if own_cg_cache:
        active_cg_cache.save()

    # --- Build coverage map -----------------------------------------------
    # coverage_map: bare_callee_name → [(test_addr, depth)]
    coverage_map = _build_coverage_map(forward_graph, flat_symbols, effective_depth)

    # --- Map changed addresses → tests ------------------------------------
    # best[(test_addr)] = min depth seen (lower is better)
    best: CounterMap = {}
    addr_to_tests: TestListMap = {}
    covered_set: set[str] = set()

    for changed_addr in changed_addresses:
        bare_name = changed_addr.rsplit("::", 1)[-1]
        hits = coverage_map.get(bare_name, [])
        if hits:
            covered_set.add(changed_addr)
        for test_addr, hit_depth in hits:
            addr_to_tests.setdefault(changed_addr, []).append(test_addr)
            if test_addr not in best or hit_depth < best[test_addr]:
                best[test_addr] = hit_depth

    # --- Fallback: file-name heuristic for uncovered symbols --------------
    fallback_used = False
    uncovered_before_fallback = set(changed_addresses) - covered_set

    if uncovered_before_fallback:
        # Build a map: production file stem → test files
        stem_to_test_files: TestListMap = {}
        for fp in manifest:
            if _is_test_file(fp):
                # A test file "tests/test_foo.py" covers the stem "foo"
                test_stem = pathlib.PurePosixPath(fp).stem
                for prefix in ("test_", ""):
                    if test_stem.startswith("test_"):
                        prod_stem = test_stem[len("test_"):]
                    else:
                        prod_stem = test_stem
                    stem_to_test_files.setdefault(prod_stem, []).append(fp)

        for changed_addr in uncovered_before_fallback:
            prod_file = changed_addr.split("::")[0]
            prod_stem = pathlib.PurePosixPath(prod_file).stem
            test_files = stem_to_test_files.get(prod_stem, [])
            if test_files:
                covered_set.add(changed_addr)
                fallback_used = True
                for tf in test_files:
                    # Use whole-file node ID for heuristic hits.
                    synthetic_addr = tf
                    if synthetic_addr not in best or best[synthetic_addr] > 99:
                        best[synthetic_addr] = 99  # heuristic sentinel depth

    # --- Build SelectionTarget list -------------------------------------------
    # Deduplicate by node_id, sort by confidence (ascending depth = higher conf)
    seen_node_ids: set[str] = set()
    targets: list[SelectionTarget] = []

    for test_addr, min_depth in sorted(best.items(), key=lambda kv: kv[1]):
        node_id = test_addr
        if node_id in seen_node_ids:
            continue
        seen_node_ids.add(node_id)

        file_path = test_addr.split("::")[0] if "::" in test_addr else test_addr
        is_heuristic = min_depth == 99

        if is_heuristic:
            confidence = 0.5
            reason = f"file-name match for changed symbol(s) in {file_path!r}"
        else:
            confidence = _confidence(min_depth)
            reason = (
                f"covers changed symbol(s) via call graph (depth {min_depth})"
            )

        targets.append(
            SelectionTarget(
                node_id=node_id,
                file=file_path,
                reason=reason,
                confidence=confidence,
            )
        )

    covered_addresses = sorted(covered_set)
    uncovered_addresses = sorted(set(changed_addresses) - covered_set)
    total = len(changed_addresses)
    coverage_fraction = len(covered_addresses) / total if total > 0 else 1.0

    logger.debug(
        "test_selection: %d changed symbols, %d tests selected, "
        "coverage %.0f%%",
        total,
        len(targets),
        coverage_fraction * 100,
    )

    return SelectionResult(
        changed_addresses=changed_addresses,
        test_targets=targets,
        covered_addresses=covered_addresses,
        uncovered_addresses=uncovered_addresses,
        coverage_fraction=coverage_fraction,
        fallback_used=fallback_used,
    )

# ---------------------------------------------------------------------------
# Convenience: diff the working tree and return changed symbols
# ---------------------------------------------------------------------------

def changed_symbols_from_diff(
    root: pathlib.Path,
    head_manifest: Manifest,
    *,
    cache: SymbolCache | None = None,
) -> list[ChangedSymbol]:
    """Return every symbol that differs between HEAD and the working tree.

    Compares the working-tree parse of every semantic file against the
    committed parse at HEAD.  Returns :class:`ChangedSymbol` records for
    every symbol that was added, modified (body or signature changed), or
    deleted.

    This function is the bridge between ``muse diff`` and test selection: it
    provides the *changed* list that ``select_tests`` needs.
    """
    own_cache = cache is None
    active_cache: SymbolCache = cache if cache is not None else load_symbol_cache(root)

    head_trees = symbols_for_snapshot(root, head_manifest, cache=active_cache)
    work_trees = symbols_for_snapshot(
        root, head_manifest, workdir=root, cache=active_cache
    )

    if own_cache:
        active_cache.save()

    result: list[ChangedSymbol] = []

    all_files: set[str] = set(head_trees) | set(work_trees)
    for file_path in sorted(all_files):
        if not is_semantic(file_path):
            continue
        head_tree = head_trees.get(file_path, {})
        work_tree = work_trees.get(file_path, {})

        all_addrs: set[str] = set(head_tree) | set(work_tree)
        for addr in all_addrs:
            head_rec = head_tree.get(addr)
            work_rec = work_tree.get(addr)

            if head_rec is None and work_rec is not None:
                result.append(ChangedSymbol(address=addr, change_kind="added"))
            elif head_rec is not None and work_rec is None:
                result.append(ChangedSymbol(address=addr, change_kind="deleted"))
            elif head_rec is not None and work_rec is not None:
                if head_rec["content_id"] != work_rec["content_id"]:
                    result.append(
                        ChangedSymbol(address=addr, change_kind="modified")
                    )

    return result