gabriel / muse public
release_analysis.py python
412 lines 14.7 KB
Raw
sha256:b6cae4448122b2cc690d913be26f7e0a539f11855b8d288bd48be43eb532b5b2 refactor: migrate all source callers off muse.core.store re… Sonnet 4.6 minor ⚠ breaking 29 days ago
1 """Semantic release analysis for the code domain.
2
Computes a :class:`~muse.core.semver.SemanticReleaseReport` by interrogating
4 the content-addressed object store and the commit graph. Called at
5 ``muse release push`` time so that MuseHub receives a pre-computed report
6 alongside the release payload — the server never needs to run analysis itself.
7
8 Architecture note
9 -----------------
10 This module intentionally lives in ``muse.plugins.code`` (not ``muse.core``)
11 because the analysis is code-domain-specific: it depends on AST parsing and
12 language classification provided by the code plugin. ``muse.core`` remains
13 domain-agnostic; only the resulting :class:`SemanticReleaseReport` TypedDict
14 is stored there as a plain data container.
15 """
16
17 import logging
18 import pathlib
19
20 from muse.core.semver import (
21 ApiChangeSummary,
22 ChangelogEntry,
23 FileHotspot,
24 LanguageStat,
25 RefactorEventSummary,
26 SemanticReleaseReport,
27 SymbolKindCount,
28 )
29 from muse.core.types import Manifest
30 from muse.core.commits import walk_commits_between
31 from muse.core.snapshots import read_snapshot
32 from muse.core.releases import ReleaseRecord
33
34 type SymbolTreeIndex = dict[str, "SymbolTree"]
35 type ApiSurface = dict[str, tuple[str, "SymbolRecord"]]
36 type CounterMap = dict[str, int]
37 from muse.domain import DomainOp
38 from muse.plugins.code._query import (
39 flat_symbol_ops,
40 is_semantic,
41 language_of,
42 symbols_for_snapshot,
43 touched_files,
44 )
45 from muse.plugins.code.ast_parser import SymbolRecord, SymbolTree
46
47 logger = logging.getLogger(__name__)
48
# Safety cap — on very large snapshots, symbol extraction only covers the first
# _MAX_SEMANTIC_FILES manifest entries, to keep push fast.
50 _MAX_SEMANTIC_FILES = 800
51
52 # ---------------------------------------------------------------------------
53 # Internal helpers
54 # ---------------------------------------------------------------------------
55
def _empty_report() -> SemanticReleaseReport:
    """Build a blank report: every counter is zero and every list is empty.

    This is the fallback payload returned when the release snapshot cannot be
    found or when the analysis raises.
    """
    blank: SemanticReleaseReport = SemanticReleaseReport(
        # Snapshot-wide statistics.
        languages=[],
        total_files=0,
        semantic_files=0,
        total_symbols=0,
        symbols_by_kind=[],
        # Change analysis.
        files_changed=0,
        api_added=[],
        api_removed=[],
        api_modified=[],
        file_hotspots=[],
        refactor_events=[],
        breaking_changes=[],
        # Provenance.
        human_commits=0,
        agent_commits=0,
        unique_agents=[],
        unique_models=[],
        reviewers=[],
    )
    return blank
76
77 def _is_public_symbol(name: str, kind: str) -> bool:
78 """Return True for symbols that are part of a public API surface.
79
80 Excludes dunder methods (except ``__init__`` and ``__call__``), private
81 names (single underscore prefix), and import/section symbols which are
82 structural rather than callable API.
83 """
84 if kind in ("import", "section", "rule"):
85 return False
86 if name.startswith("__") and name.endswith("__"):
87 return name in ("__init__", "__call__", "__new__")
88 return not name.startswith("_")
89
90 def _build_language_stats(
91 manifest: Manifest,
92 symbol_map: SymbolTreeIndex,
93 ) -> list[LanguageStat]:
94 """Aggregate per-language file and symbol counts from *manifest*."""
95 lang_files: CounterMap = {}
96 lang_symbols: CounterMap = {}
97
98 for file_path in manifest:
99 lang = language_of(file_path)
100 lang_files[lang] = lang_files.get(lang, 0) + 1
101
102 for file_path, tree in symbol_map.items():
103 lang = language_of(file_path)
104 lang_symbols[lang] = lang_symbols.get(lang, 0) + len(tree)
105
106 stats: list[LanguageStat] = [
107 LanguageStat(
108 language=lang,
109 files=lang_files[lang],
110 symbols=lang_symbols.get(lang, 0),
111 )
112 for lang in sorted(lang_files, key=lambda l: lang_files[l], reverse=True)
113 ]
114 return stats
115
116 def _build_symbol_kind_counts(symbol_map: SymbolTreeIndex) -> list[SymbolKindCount]:
117 """Count symbols by kind across all files in *symbol_map*."""
118 counts: CounterMap = {}
119 for tree in symbol_map.values():
120 for rec in tree.values():
121 kind = rec["kind"]
122 counts[kind] = counts.get(kind, 0) + 1
123 return [
124 SymbolKindCount(kind=k, count=counts[k])
125 for k in sorted(counts, key=lambda k: counts[k], reverse=True)
126 ]
127
def _api_surface(
    root: pathlib.Path,
    manifest: Manifest,
) -> ApiSurface:
    """Collect the public symbols of a snapshot into one flat mapping.

    Symbols are extracted with ``symbols_for_snapshot(root, manifest)`` and
    filtered with ``_is_public_symbol``. Each surviving symbol address maps to
    a ``(language, SymbolRecord)`` pair, where the language is that of the
    file the symbol was found in.
    """
    exported: ApiSurface = {}
    for path, tree in symbols_for_snapshot(root, manifest).items():
        file_language = language_of(path)
        exported.update(
            (address, (file_language, record))
            for address, record in tree.items()
            if _is_public_symbol(record["name"], record["kind"])
        )
    return exported
141
142 def _build_api_changes(
143 prev_surface: ApiSurface,
144 curr_surface: ApiSurface,
145 max_changes: int = 200,
146 ) -> tuple[list[ApiChangeSummary], list[ApiChangeSummary], list[ApiChangeSummary]]:
147 """Diff two API surfaces.
148
149 Returns ``(added, removed, modified)`` lists capped at *max_changes* each.
150 A symbol is "modified" when its ``signature_id`` changed (public contract
151 change) or its ``content_id`` changed but ``signature_id`` is the same
152 (implementation change also surfaced, but ranked lower).
153 """
154 added: list[ApiChangeSummary] = []
155 removed: list[ApiChangeSummary] = []
156 modified: list[ApiChangeSummary] = []
157
158 all_addresses = set(prev_surface) | set(curr_surface)
159 for address in sorted(all_addresses):
160 if address not in prev_surface:
161 lang, rec = curr_surface[address]
162 added.append(ApiChangeSummary(
163 address=address, language=lang, kind=rec["kind"], change="added",
164 ))
165 elif address not in curr_surface:
166 lang, rec = prev_surface[address]
167 removed.append(ApiChangeSummary(
168 address=address, language=lang, kind=rec["kind"], change="removed",
169 ))
170 else:
171 prev_rec = prev_surface[address][1]
172 curr_rec = curr_surface[address][1]
173 if prev_rec["content_id"] != curr_rec["content_id"]:
174 lang = curr_surface[address][0]
175 change = "modified"
176 modified.append(ApiChangeSummary(
177 address=address, language=lang, kind=curr_rec["kind"], change=change,
178 ))
179
180 return added[:max_changes], removed[:max_changes], modified[:max_changes]
181
182 def _is_patch_op(op: DomainOp) -> bool:
183 return op["op"] == "patch"
184
185 def _build_file_hotspots(
186 changelog: list[ChangelogEntry],
187 structured_deltas: list[list[DomainOp]],
188 max_hotspots: int = 10,
189 ) -> list[FileHotspot]:
190 """Count how many times each file was touched across this release's commits."""
191 churn: CounterMap = {}
192 for delta in structured_deltas:
193 for file_path in touched_files(delta):
194 churn[file_path] = churn.get(file_path, 0) + 1
195 # Also count non-patch top-level ops (whole-file add/delete).
196 for op in delta:
197 if not _is_patch_op(op):
198 addr = op["address"]
199 churn[addr] = churn.get(addr, 0) + 1
200
201 top = sorted(churn, key=lambda p: churn[p], reverse=True)[:max_hotspots]
202 return [
203 FileHotspot(
204 file_path=p,
205 change_count=churn[p],
206 language=language_of(p),
207 )
208 for p in top
209 ]
210
211 def _build_refactor_events(
212 changelog: list[ChangelogEntry],
213 structured_deltas: list[list[DomainOp]],
214 max_events: int = 50,
215 ) -> list[RefactorEventSummary]:
216 """Extract structural refactoring events from commit structured_deltas."""
217 events: list[RefactorEventSummary] = []
218 for entry, delta in zip(changelog, structured_deltas):
219 cid = entry["commit_id"]
220 for op in delta:
221 if op["op"] == "patch":
222 if "from_address" in op:
223 events.append(RefactorEventSummary(
224 kind="move",
225 address=op["address"],
226 detail=f"moved from {op.get('from_address', '?')}",
227 commit_id=cid,
228 ))
229 elif op["op"] == "insert":
230 addr = op["address"]
231 if "/" in addr: # file-level insert = new file
232 events.append(RefactorEventSummary(
233 kind="insert",
234 address=addr,
235 detail=op.get("content_summary", ""),
236 commit_id=cid,
237 ))
238 elif op["op"] == "delete":
239 addr = op["address"]
240 if "/" in addr: # file-level delete = removed file
241 events.append(RefactorEventSummary(
242 kind="delete",
243 address=addr,
244 detail=op.get("content_summary", ""),
245 commit_id=cid,
246 ))
247 # Symbol-level renames: same body_hash, different name
248 for sym_op in flat_symbol_ops(delta):
249 if sym_op["op"] == "insert":
250 events.append(RefactorEventSummary(
251 kind="insert",
252 address=sym_op["address"],
253 detail=sym_op.get("content_summary", ""),
254 commit_id=cid,
255 ))
256 elif sym_op["op"] == "delete":
257 events.append(RefactorEventSummary(
258 kind="delete",
259 address=sym_op["address"],
260 detail=sym_op.get("content_summary", ""),
261 commit_id=cid,
262 ))
263 if len(events) >= max_events:
264 break
265 return events[:max_events]
266
267 # ---------------------------------------------------------------------------
268 # Public API
269 # ---------------------------------------------------------------------------
270
def compute_release_analysis(
    root: pathlib.Path,
    release: ReleaseRecord,
    prev_snapshot_id: str | None = None,
) -> SemanticReleaseReport:
    """Run the semantic analysis for *release* without ever raising.

    Args:
        root: Repository root (must contain ``.muse/``).
        release: The release whose tip snapshot is being analysed.
        prev_snapshot_id: Snapshot ID of the previous release. When given,
            the API surface is diffed against that snapshot; when omitted,
            every public symbol is reported as "added".

    Returns:
        The :class:`~muse.core.semver.SemanticReleaseReport` produced by
        ``_compute``. If ``_compute`` raises any ``Exception``, the failure is
        logged as a warning (with traceback) and an empty report is returned
        instead, so an analysis failure never blocks a push.
    """
    try:
        report = _compute(root, release, prev_snapshot_id)
    except Exception:
        # Deliberate best-effort boundary: log with traceback, then degrade.
        logger.warning(
            "⚠️ Semantic analysis failed for release %s — attaching empty report.",
            release.tag,
            exc_info=True,
        )
        report = _empty_report()
    return report
299
def _cap_manifest(manifest: Manifest, semantic_count: int) -> Manifest:
    """Return *manifest*, truncated when it holds too many semantic files.

    When *semantic_count* exceeds ``_MAX_SEMANTIC_FILES`` only the first
    ``_MAX_SEMANTIC_FILES`` manifest entries (in iteration order) are kept;
    otherwise *manifest* is returned unchanged.
    """
    if semantic_count <= _MAX_SEMANTIC_FILES:
        return manifest
    return dict(list(manifest.items())[:_MAX_SEMANTIC_FILES])


def _compute(
    root: pathlib.Path,
    release: ReleaseRecord,
    prev_snapshot_id: str | None,
) -> SemanticReleaseReport:
    """Build the semantic report for *release*; exceptions propagate.

    Steps:

    1. Read the release snapshot; return an empty report if it is missing.
    2. Extract symbols from the (possibly capped) manifest and derive the
       language and symbol-kind statistics.
    3. Diff the public API surface against *prev_snapshot_id*, if given.
    4. Collect each changelog commit's structured delta to compute the
       changed-file count, file hotspots and refactor events.
    5. Aggregate breaking changes, human/agent commit counts, agent and
       model IDs from the changelog, and reviewers from the walked commits.

    Args:
        root: Repository root.
        release: Release record supplying ``snapshot_id``, ``commit_id`` and
            ``changelog``.
        prev_snapshot_id: Snapshot to diff the API surface against, or a
            falsy value to treat every public symbol as added.

    Returns:
        The populated report, or an empty report when the release snapshot
        cannot be read.
    """
    # -- Snapshot manifest for current release ---------------------------------
    snap = read_snapshot(root, release.snapshot_id)
    if snap is None:
        logger.warning("⚠️ Snapshot %s not found; analysis skipped.", release.snapshot_id)
        return _empty_report()

    manifest = snap.manifest
    total_files = len(manifest)
    semantic_file_count = sum(1 for p in manifest if is_semantic(p))

    # Cap extraction to avoid blocking on huge snapshots.
    capped_manifest = _cap_manifest(manifest, semantic_file_count)

    sym_map = symbols_for_snapshot(root, capped_manifest)
    total_symbols = sum(len(tree) for tree in sym_map.values())
    # File counts use the full manifest; symbol counts only cover capped files.
    languages = _build_language_stats(manifest, sym_map)
    symbols_by_kind = _build_symbol_kind_counts(sym_map)

    # -- API surface diff ------------------------------------------------------
    # NOTE(review): _api_surface extracts symbols for capped_manifest again;
    # reusing sym_map here would avoid the second pass.
    curr_surface = _api_surface(root, capped_manifest)
    prev_surface: ApiSurface = {}
    if prev_snapshot_id:
        prev_snap = read_snapshot(root, prev_snapshot_id)
        if prev_snap:
            prev_manifest = prev_snap.manifest
            # Use the same capping rule as the current snapshot. Capping the
            # previous manifest on its total length (as before) could truncate
            # it when the current one was not, producing spurious "added"
            # symbols.
            prev_semantic_count = sum(1 for p in prev_manifest if is_semantic(p))
            prev_surface = _api_surface(
                root, _cap_manifest(prev_manifest, prev_semantic_count)
            )
        else:
            logger.warning(
                "⚠️ Previous snapshot %s not found; all public symbols will be reported as added.",
                prev_snapshot_id,
            )

    api_added, api_removed, api_modified = _build_api_changes(prev_surface, curr_surface)

    # -- Commit-level analysis from structured_deltas ---------------------------
    changelog = release.changelog
    commits = walk_commits_between(root, release.commit_id, None, max_commits=500)
    # Index walked commits by ID so deltas can be emitted in changelog order.
    # Changelog entries with no walked commit (or no delta) get an empty list,
    # keeping structured_deltas aligned one-to-one with changelog.
    commit_map = {c.commit_id: c for c in commits}
    structured_deltas: list[list[DomainOp]] = []
    for entry in changelog:
        commit = commit_map.get(entry["commit_id"])
        if commit and commit.structured_delta:
            structured_deltas.append(commit.structured_delta["ops"])
        else:
            structured_deltas.append([])

    files_changed = len({
        path
        for delta in structured_deltas
        for path in touched_files(delta)
    })

    file_hotspots = _build_file_hotspots(changelog, structured_deltas)
    refactor_events = _build_refactor_events(changelog, structured_deltas)

    # -- Provenance aggregation ------------------------------------------------
    breaking: list[str] = []
    seen_bc: set[str] = set()
    human_commits = 0
    agent_commits = 0
    agents: set[str] = set()
    models: set[str] = set()
    reviewers_set: set[str] = set()

    for entry in changelog:
        # De-duplicate breaking changes while keeping first-seen order.
        for bc in entry.get("breaking_changes", []):
            if bc not in seen_bc:
                seen_bc.add(bc)
                breaking.append(bc)
        aid = entry.get("agent_id", "")
        mid = entry.get("model_id", "")
        # A commit with an agent_id counts as an agent commit, otherwise human.
        if aid:
            agent_commits += 1
            agents.add(aid)
        else:
            human_commits += 1
        if mid:
            models.add(mid)

    # NOTE(review): reviewers are gathered from every walked commit (up to 500
    # ancestors of the release commit), not only those listed in the
    # changelog — confirm this is intended.
    for c in commits:
        reviewers_set.update(c.reviewed_by)

    return SemanticReleaseReport(
        languages=languages,
        total_files=total_files,
        semantic_files=semantic_file_count,
        total_symbols=total_symbols,
        symbols_by_kind=symbols_by_kind,
        files_changed=files_changed,
        api_added=api_added,
        api_removed=api_removed,
        api_modified=api_modified,
        file_hotspots=file_hotspots,
        refactor_events=refactor_events,
        breaking_changes=breaking,
        human_commits=human_commits,
        agent_commits=agent_commits,
        unique_agents=sorted(agents),
        unique_models=sorted(models),
        reviewers=sorted(reviewers_set),
    )
File History 1 commit
sha256:b6cae4448122b2cc690d913be26f7e0a539f11855b8d288bd48be43eb532b5b2 refactor: migrate all source callers off muse.core.store re… Sonnet 4.6 minor 29 days ago