gabriel / muse public
codemap.py python
426 lines 15.1 KB
Raw
sha256:be3641f35bdbcc094677776a77b9aa6a5dab891f8fab201dc162d03c2bab5aea fix(read): strip position:null from structured_delta ops in… Sonnet 4.6 patch 23 days ago
1 """muse code codemap — repository semantic topology.
2
3 Generates a structural map of the codebase from committed snapshot data:
4
5 * **Modules ranked by size** — symbol count per file, shown with its importer count
6 * **Import in-degree** — how many other files import each module
7 * **Import cycles** — circular dependency chains detected via iterative DFS
8 * **High-centrality symbols** — functions called from the most callers
9 * **Boundary files** — high fan-out (imports at least 3 modules) and zero fan-in (nothing imports it)
10 * **Agent-safe zones** — completely isolated files with no import coupling
11
12 This is a semantic topology view, not a file-system listing. It reveals the
13 actual shape of a codebase — where the load-bearing columns are, where the
14 cycles hide, and where parallel agents can safely work without collision.
15
16 Usage::
17
18 muse code codemap
19 muse code codemap --commit HEAD~10
20 muse code codemap --language Python
21 muse code codemap --top 20
22 muse code codemap --min-importers 2
23 muse code codemap --json
24
25 Output::
26
27 Semantic codemap — commit a1b2c3d4
28
29 Top modules by size:
30 src/billing.py 42 symbols (12 importers) ⬛ HIGH CENTRALITY
31 src/models.py 31 symbols (8 importers)
32 src/auth.py 18 symbols (5 importers)
33
34 Import cycles (2):
35 src/billing.py → src/utils.py → src/billing.py
36 src/api.py → src/auth.py → src/api.py
37
38 High-centrality symbols (most callers):
39 src/billing.py::compute_total 14 callers
40 src/auth.py::validate_token 9 callers
41
42 Boundary files (high fan-out, low fan-in):
43 src/cli.py imports 8 modules ← imported by 0
44
45 Agent-safe zones (no import coupling — safe for parallel work):
46 src/utils.py
47 src/constants.py
48
49 Flags:
50
51 ``--commit, -c REF``
52 Analyse a historical snapshot instead of HEAD.
53
54 ``--language LANG``
55 Restrict analysis to files of this language.
56
57 ``--top N``
58 Show top N entries in each section (default: 15, must be ≥ 1).
59
60 ``--min-importers N``
61 Only include modules imported by at least N other files in the ranked
62 module list (default: 0 = show all).
63
64 ``--json``
65 Emit the full codemap as JSON.
66 """
67
68 import argparse
69 import json
70 import logging
71 import pathlib
72 from typing import TypedDict
73
74 from muse.core.envelope import EnvelopeJson, make_envelope
75 from muse.core.errors import ExitCode
76 from muse.core.repo import require_repo
77 from muse.core.types import Manifest
78 from muse.core.refs import read_current_branch
79 from muse.core.commits import resolve_commit_ref
80 from muse.core.snapshots import get_commit_snapshot_manifest
81 from muse.core.symbol_cache import load_symbol_cache
82 from muse.core.timing import start_timer
83 from muse.plugins.code._callgraph import build_reverse_graph
84 from muse.plugins.code._query import symbols_for_snapshot
85 from muse.plugins.code.ast_parser import SymbolTree
86 from muse.core.validation import clamp_int, sanitize_display
87
# Parsed symbol tree for every analysed file, keyed by file path.
type _SymbolTreeMap = dict[str, SymbolTree]
# Import adjacency list: file path → list of file paths it imports.
type _ImportOut = dict[str, list[str]]
# Per-file integer tally (used for import fan-in and for symbol counts).
type _CounterMap = dict[str, int]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
93
94 # ---------------------------------------------------------------------------
95 # Typed output shape
96 # ---------------------------------------------------------------------------
97
class _ModuleEntry(TypedDict):
    """One row of the ranked ``modules`` list in the JSON output."""

    # Path of the file as keyed in the symbol map.
    file: str
    # Number of symbols recorded for the file.
    symbol_count: int
    # Import fan-in: how many other analysed files import this one.
    importers: int
    # Import fan-out: how many analysed files this one imports.
    imports: int
103
class _CentralityEntry(TypedDict):
    """One row of the ``high_centrality`` list in the JSON output."""

    # Key of the symbol in the reverse call graph.
    name: str
    # Number of callers recorded for that symbol.
    callers: int
107
class _BoundaryEntry(TypedDict):
    """One row of the ``boundary_files`` list in the JSON output."""

    # Path of the boundary file.
    file: str
    # Number of analysed files this file imports.
    fan_out: int
    # Number of analysed files that import this file.
    fan_in: int
112
class _CodemapOutputJson(EnvelopeJson):
    """Top-level JSON envelope emitted by ``muse code codemap --json``.

    The fields declared here are filled in by ``run``; the remaining keys are
    supplied by ``make_envelope`` through the ``EnvelopeJson`` base.

    Fields
    ------
    commit            ID of the commit that was analysed (``commit.commit_id``).
    branch            Current branch name.
    language_filter   Language filter applied, or None.
    modules           Files ranked by symbol count (after ``--min-importers``
                      filtering, truncated to ``--top``).
    import_cycles     Detected circular dependency chains; each chain repeats
                      its first file at the end.
    high_centrality   Symbols with the most callers (truncated to ``--top``).
    boundary_files    Files with fan-out of at least 3 and zero fan-in.
    agent_safe_zones  Files with no import coupling (safe for parallel work).

    Envelope fields (NOTE(review): defined by ``EnvelopeJson``, which is not
    visible from this module — confirm against ``muse.core.envelope``)
    ------
    schema_version    Muse version string.
    exit_code         Always 0 — errors raise SystemExit before JSON emits.
    duration_ms       Wall-clock time for the full analysis in milliseconds.
    """

    commit: str
    branch: str
    language_filter: str | None
    modules: list[_ModuleEntry]
    import_cycles: list[list[str]]
    high_centrality: list[_CentralityEntry]
    boundary_files: list[_BoundaryEntry]
    agent_safe_zones: list[str]
139
140 def _build_import_graph(
141 sym_map: _SymbolTreeMap,
142 ) -> tuple[_ImportOut, _CounterMap]:
143 """Return ``(imports_out, in_degree)`` for the files in *sym_map*.
144
145 Builds a stem-based heuristic import graph: for each import symbol in
146 each file, check whether the imported module stem matches a known file in
147 the map. Edges are deduplicated so a file importing the same module via
148 multiple ``from X import a, b`` statements counts as one edge.
149
150 Args:
151 sym_map: Pre-parsed symbol trees, keyed by file path. Already
152 filtered by language when the caller applies a filter.
153
154 Returns:
155 ``imports_out`` — adjacency list: file → list of files it imports.
156 ``in_degree`` — import fan-in count per file.
157 """
158 stem_to_file: Manifest = {
159 pathlib.PurePosixPath(fp).stem: fp for fp in sym_map
160 }
161
162 imports_out: _ImportOut = {fp: [] for fp in sym_map}
163 in_degree: _CounterMap = {fp: 0 for fp in sym_map}
164
165 for file_path, tree in sym_map.items():
166 seen_targets: set[str] = set()
167 for rec in tree.values():
168 if rec["kind"] != "import":
169 continue
170 # rec["name"] is the bare module name (e.g. "utils" for both
171 # `import utils` and `from utils import X`).
172 target = stem_to_file.get(rec["name"])
173 if target and target != file_path and target not in seen_targets:
174 seen_targets.add(target)
175 imports_out[file_path].append(target)
176 in_degree[target] += 1
177
178 return imports_out, in_degree
179
180 def _find_cycles(imports_out: _ImportOut) -> list[list[str]]:
181 """Detect import cycles via iterative DFS. Returns cycle paths.
182
183 Uses an explicit stack instead of recursion so that deeply nested import
184 graphs (thousands of files in a chain) cannot exhaust Python's call stack.
185 O(V+E) — every node is visited at most once globally.
186
187 The ``in_stack`` dict maps each node on the current DFS path to its index
188 in ``path``, giving O(1) lookups for both cycle detection and cycle
189 extraction (replacing the previous O(N) ``path.index()`` call).
190 """
191 cycles: list[list[str]] = []
192 visited: set[str] = set()
193
194 for start in imports_out:
195 if start in visited:
196 continue
197 # Stack frame: (node, path-so-far, in_stack: node→index-in-path)
198 stack: list[tuple[str, list[str], dict[str, int]]] = [(start, [], {})]
199 while stack:
200 node, path, in_stack = stack.pop()
201 if node in in_stack:
202 idx = in_stack[node]
203 cycles.append(path[idx:] + [node])
204 continue
205 if node in visited:
206 continue
207 visited.add(node)
208 new_in_stack = {**in_stack, node: len(path)}
209 new_path = path + [node]
210 for neighbour in imports_out.get(node, []):
211 stack.append((neighbour, new_path, new_in_stack))
212
213 return cycles
214
def register(subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]") -> None:
    """Attach the ``codemap`` subcommand and its flags to *subparsers*.

    The module docstring is reused verbatim as the long description, and
    ``run`` is installed as the handler through the ``func`` default.
    """
    codemap = subparsers.add_parser(
        "codemap",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
        help="Generate a semantic topology map of the repository.",
    )
    add_flag = codemap.add_argument

    add_flag(
        "--commit", "-c",
        dest="ref",
        metavar="REF",
        default=None,
        help="Analyse this commit instead of HEAD.",
    )
    add_flag(
        "--language", "-l",
        dest="language",
        metavar="LANG",
        default=None,
        help="Restrict analysis to this language.",
    )
    add_flag(
        "--top", "-n",
        dest="top",
        metavar="N",
        type=int,
        default=15,
        help="Number of entries to show in each ranked section (must be ≥ 1).",
    )
    add_flag(
        "--min-importers",
        dest="min_importers",
        metavar="N",
        type=int,
        default=0,
        help="Only show modules imported by at least N other files (default: 0 = all).",
    )
    add_flag(
        "--json", "-j",
        dest="json_out",
        action="store_true",
        help="Emit results as JSON.",
    )

    codemap.set_defaults(func=run)
244
def run(args: argparse.Namespace) -> None:
    """Generate a semantic topology map of the repository.

    Ranks modules by size, detects import cycles, finds high-centrality symbols,
    identifies boundary files (fan-out of at least 3, zero fan-in), and surfaces
    agent-safe zones — files with no import coupling safe for parallel work.
    All analysis runs from the committed snapshot; the working tree is never read.

    Agent quickstart
    ----------------
    ::

        muse code codemap --json
        muse code codemap --language Python --top 20 --json
        muse code codemap --commit dev --json

    JSON fields
    -----------
    commit            ID of the commit that was analysed.
    branch            Current branch name.
    language_filter   Language filter applied (``null`` if none).
    modules           List of module objects ranked by size: ``file``,
                      ``symbol_count``, ``importers``, ``imports``.
    import_cycles     List of import cycles (each a list of file paths whose
                      last element repeats the first).
    high_centrality   List of ``{name, callers}`` objects, most callers first.
    boundary_files    List of ``{file, fan_out, fan_in}`` objects.
    agent_safe_zones  Files with no import coupling (safe for parallel work).

    The remaining keys (e.g. ``exit_code``) are whatever
    ``make_envelope(elapsed)`` contributes.

    Exit codes
    ----------
    0   Analysis complete.
    1   Invalid arguments or commit not found.
    2   Not inside a Muse repository.

    Args:
        args: Parsed CLI namespace carrying ``ref``, ``language``, ``top``,
            ``min_importers`` and ``json_out`` as set up by ``register``.

    Raises:
        SystemExit: With ``ExitCode.USER_ERROR`` when an argument is out of
            range or the commit cannot be resolved.
    """
    elapsed = start_timer()
    ref: str | None = args.ref
    language: str | None = args.language
    top: int = clamp_int(args.top, 1, 10_000, 'top')
    min_importers: int = clamp_int(args.min_importers, 0, 10000, 'min_importers')
    json_out: bool = args.json_out

    # Defensive re-validation. NOTE(review): presumably unreachable if
    # clamp_int already constrains the value to the given range — confirm.
    if top < 1:
        logger.error("--top must be at least 1, got %d", top)
        raise SystemExit(ExitCode.USER_ERROR)
    if min_importers < 0:
        logger.error("--min-importers must be non-negative, got %d", min_importers)
        raise SystemExit(ExitCode.USER_ERROR)

    root = require_repo()

    branch = read_current_branch(root)
    commit = resolve_commit_ref(root, branch, ref)
    if commit is None:
        logger.error("Commit %r not found.", ref or "HEAD")
        raise SystemExit(ExitCode.USER_ERROR)

    manifest = get_commit_snapshot_manifest(root, commit.commit_id) or {}

    # Single shared cache — each blob is parsed at most once across all passes.
    cache = load_symbol_cache(root)

    sym_map = symbols_for_snapshot(root, manifest, language_filter=language, cache=cache)

    if language and not sym_map:
        logger.warning("No files matched language filter %r — output will be empty.", language)

    file_sym_counts: _CounterMap = {fp: len(tree) for fp, tree in sym_map.items()}

    # Import graph built from the pre-parsed sym_map (no second parse pass).
    imports_out, in_degree = _build_import_graph(sym_map)

    cycles = _find_cycles(imports_out)

    # Call graph uses the shared cache (no re-parse of Python blobs).
    reverse = build_reverse_graph(root, manifest, cache=cache)
    centrality: list[tuple[str, int]] = sorted(
        ((name, len(callers)) for name, callers in reverse.items()),
        key=lambda t: t[1],
        reverse=True,
    )[:top]

    # Boundary files: imports at least 3 files but is imported by none.
    fan_out = {fp: len(targets) for fp, targets in imports_out.items() if targets}
    boundaries: list[tuple[str, int, int]] = sorted(
        [
            (fp, fan_out.get(fp, 0), in_degree.get(fp, 0))
            for fp in sym_map
            if fan_out.get(fp, 0) >= 3 and in_degree.get(fp, 0) == 0
        ],
        key=lambda t: t[1],
        reverse=True,
    )[:top]

    # Agent-safe zones: completely decoupled files (no imports in or out).
    agent_safe: list[str] = sorted(
        fp for fp in sym_map
        if in_degree.get(fp, 0) == 0 and not imports_out.get(fp)
    )[:top]

    # Ranked modules — optionally filtered by --min-importers.
    ranked: list[tuple[str, int]] = sorted(
        (
            (fp, cnt) for fp, cnt in file_sym_counts.items()
            if in_degree.get(fp, 0) >= min_importers
        ),
        key=lambda t: t[1],
        reverse=True,
    )[:top]

    if json_out:
        print(json.dumps(_CodemapOutputJson(
            **make_envelope(elapsed),
            commit=commit.commit_id,
            branch=branch,
            language_filter=language,
            modules=[
                _ModuleEntry(
                    file=fp,
                    symbol_count=cnt,
                    importers=in_degree.get(fp, 0),
                    imports=len(imports_out.get(fp, [])),
                )
                for fp, cnt in ranked
            ],
            import_cycles=cycles,
            high_centrality=[
                _CentralityEntry(name=name, callers=cnt)
                for name, cnt in centrality
            ],
            boundary_files=[
                _BoundaryEntry(file=fp, fan_out=fo, fan_in=fi)
                for fp, fo, fi in boundaries
            ],
            agent_safe_zones=agent_safe,
        )))
        return

    print(f"\nSemantic codemap — commit {commit.commit_id}")
    if language:
        print(f" (language: {language})")
    if min_importers:
        print(f" (min-importers: {min_importers})")
    print("─" * 62)

    print(f"\nTop modules by size (top {min(top, len(ranked))}):")
    if ranked:
        # Sanitise once and size the column from the sanitised text, so the
        # padding matches what is actually printed.
        shown = [(sanitize_display(fp), fp, cnt) for fp, cnt in ranked]
        max_fp = max(len(label) for label, _, _ in shown)
        for label, fp, cnt in shown:
            imp = in_degree.get(fp, 0)
            imp_label = f"({imp} importers)" if imp else "(not imported)"
            print(f" {label:<{max_fp}} {cnt:>3} symbols {imp_label}")
    else:
        print(" (no files match the current filters)")

    print(f"\nImport cycles ({len(cycles)}):")
    if cycles:
        for cycle in cycles[:top]:
            # File paths come from repository content; sanitise them like
            # every other path printed by this command.
            print(f" {' → '.join(sanitize_display(fp) for fp in cycle)}")
    else:
        print(" ✅ No import cycles detected")

    print("\nHigh-centrality symbols — most callers (Python):")
    if centrality:
        for name, cnt in centrality:
            print(f" {sanitize_display(name):<40} {cnt} caller(s)")
    else:
        print(" (no Python call graph available)")

    print("\nBoundary files — high fan-out, zero fan-in:")
    if boundaries:
        for fp, fo, fi in boundaries:
            print(f" {sanitize_display(fp)} imports {fo} ← imported by {fi}")
    else:
        print(" (none detected)")

    print(f"\nAgent-safe zones — no import coupling ({len(agent_safe)}):")
    if agent_safe:
        for fp in agent_safe:
            print(f" {sanitize_display(fp)}")
    else:
        print(" (all files are coupled via imports)")
426
File History 3 commits
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 22 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 29 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 29 days ago