gabriel / muse public
release_analysis.py python
429 lines 14.7 KB
Raw
sha256:1c4b3e3a9a1f300774c3ee662b572a698d5fd405bf765a71e6011a2e9c3eaaaa feat: Muse — version control for the agent era Human 74 days ago
1 """Semantic release analysis for the code domain.
2
3 Computes a :class:`~muse.core.store.SemanticReleaseReport` by interrogating
4 the content-addressed object store and the commit graph. Called at
5 ``muse release push`` time so that MuseHub receives a pre-computed report
6 alongside the release payload — the server never needs to run analysis itself.
7
8 Architecture note
9 -----------------
10 This module intentionally lives in ``muse.plugins.code`` (not ``muse.core``)
11 because the analysis is code-domain-specific: it depends on AST parsing and
12 language classification provided by the code plugin. ``muse.core`` remains
13 domain-agnostic; only the resulting :class:`SemanticReleaseReport` TypedDict
14 is stored there as a plain data container.
15 """
16
17 from __future__ import annotations
18
19 import logging
20 import pathlib
21
22 from muse.core.semver import (
23 ApiChangeSummary,
24 ChangelogEntry,
25 FileHotspot,
26 LanguageStat,
27 RefactorEventSummary,
28 SemanticReleaseReport,
29 SymbolKindCount,
30 )
31 from muse.core.store import (
32 Manifest,
33 ReleaseRecord,
34 read_snapshot,
35 walk_commits_between,
36 )
37
# Symbol trees keyed by the file path they were extracted from.
type SymbolTreeIndex = dict[str, "SymbolTree"]
# Symbol address -> (language, symbol record) for public symbols.
type ApiSurface = dict[str, tuple[str, "SymbolRecord"]]
# Arbitrary string key -> running count.
type CounterMap = dict[str, int]
41 from muse.domain import DomainOp
42 from muse.plugins.code._query import (
43 flat_symbol_ops,
44 is_semantic,
45 language_of,
46 symbols_for_snapshot,
47 touched_files,
48 )
49 from muse.plugins.code.ast_parser import SymbolRecord, SymbolTree
50
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Safety cap — oversized snapshots have their manifest truncated to this many
# entries before symbol extraction, so that analysis keeps push fast.
_MAX_SEMANTIC_FILES = 800
55
56
57 # ---------------------------------------------------------------------------
58 # Internal helpers
59 # ---------------------------------------------------------------------------
60
61
def _empty_report() -> SemanticReleaseReport:
    """Return a report in which every list is empty and every counter is zero.

    Used as the fallback payload when analysis cannot be performed.
    """
    blank = SemanticReleaseReport(
        # Snapshot-level statistics.
        languages=[],
        total_files=0,
        semantic_files=0,
        total_symbols=0,
        symbols_by_kind=[],
        # Change statistics.
        files_changed=0,
        api_added=[],
        api_removed=[],
        api_modified=[],
        file_hotspots=[],
        refactor_events=[],
        breaking_changes=[],
        # Provenance.
        human_commits=0,
        agent_commits=0,
        unique_agents=[],
        unique_models=[],
        reviewers=[],
    )
    return blank
82
83
84 def _is_public_symbol(name: str, kind: str) -> bool:
85 """Return True for symbols that are part of a public API surface.
86
87 Excludes dunder methods (except ``__init__`` and ``__call__``), private
88 names (single underscore prefix), and import/section symbols which are
89 structural rather than callable API.
90 """
91 if kind in ("import", "section", "rule"):
92 return False
93 if name.startswith("__") and name.endswith("__"):
94 return name in ("__init__", "__call__", "__new__")
95 return not name.startswith("_")
96
97
98 def _build_language_stats(
99 manifest: Manifest,
100 symbol_map: SymbolTreeIndex,
101 ) -> list[LanguageStat]:
102 """Aggregate per-language file and symbol counts from *manifest*."""
103 lang_files: CounterMap = {}
104 lang_symbols: CounterMap = {}
105
106 for file_path in manifest:
107 lang = language_of(file_path)
108 lang_files[lang] = lang_files.get(lang, 0) + 1
109
110 for file_path, tree in symbol_map.items():
111 lang = language_of(file_path)
112 lang_symbols[lang] = lang_symbols.get(lang, 0) + len(tree)
113
114 stats: list[LanguageStat] = [
115 LanguageStat(
116 language=lang,
117 files=lang_files[lang],
118 symbols=lang_symbols.get(lang, 0),
119 )
120 for lang in sorted(lang_files, key=lambda l: lang_files[l], reverse=True)
121 ]
122 return stats
123
124
125 def _build_symbol_kind_counts(symbol_map: SymbolTreeIndex) -> list[SymbolKindCount]:
126 """Count symbols by kind across all files in *symbol_map*."""
127 counts: CounterMap = {}
128 for tree in symbol_map.values():
129 for rec in tree.values():
130 kind = rec["kind"]
131 counts[kind] = counts.get(kind, 0) + 1
132 return [
133 SymbolKindCount(kind=k, count=counts[k])
134 for k in sorted(counts, key=lambda k: counts[k], reverse=True)
135 ]
136
137
def _api_surface(
    root: pathlib.Path,
    manifest: Manifest,
) -> ApiSurface:
    """Extract the public API surface of the snapshot described by *manifest*.

    Symbols are extracted for *manifest* and filtered with
    :func:`_is_public_symbol`.

    Returns:
        A flat mapping of symbol address → ``(language, SymbolRecord)``.
    """
    public: ApiSurface = {}
    for file_path, tree in symbols_for_snapshot(root, manifest).items():
        language = language_of(file_path)
        public.update(
            (address, (language, record))
            for address, record in tree.items()
            if _is_public_symbol(record["name"], record["kind"])
        )
    return public
151
152
153 def _build_api_changes(
154 prev_surface: ApiSurface,
155 curr_surface: ApiSurface,
156 max_changes: int = 200,
157 ) -> tuple[list[ApiChangeSummary], list[ApiChangeSummary], list[ApiChangeSummary]]:
158 """Diff two API surfaces.
159
160 Returns ``(added, removed, modified)`` lists capped at *max_changes* each.
161 A symbol is "modified" when its ``signature_id`` changed (public contract
162 change) or its ``content_id`` changed but ``signature_id`` is the same
163 (implementation change also surfaced, but ranked lower).
164 """
165 added: list[ApiChangeSummary] = []
166 removed: list[ApiChangeSummary] = []
167 modified: list[ApiChangeSummary] = []
168
169 all_addresses = set(prev_surface) | set(curr_surface)
170 for address in sorted(all_addresses):
171 if address not in prev_surface:
172 lang, rec = curr_surface[address]
173 added.append(ApiChangeSummary(
174 address=address, language=lang, kind=rec["kind"], change="added",
175 ))
176 elif address not in curr_surface:
177 lang, rec = prev_surface[address]
178 removed.append(ApiChangeSummary(
179 address=address, language=lang, kind=rec["kind"], change="removed",
180 ))
181 else:
182 prev_rec = prev_surface[address][1]
183 curr_rec = curr_surface[address][1]
184 if prev_rec["content_id"] != curr_rec["content_id"]:
185 lang = curr_surface[address][0]
186 change = "modified"
187 modified.append(ApiChangeSummary(
188 address=address, language=lang, kind=curr_rec["kind"], change=change,
189 ))
190
191 return added[:max_changes], removed[:max_changes], modified[:max_changes]
192
193
194 def _is_patch_op(op: DomainOp) -> bool:
195 return op["op"] == "patch"
196
197
198 def _build_file_hotspots(
199 changelog: list[ChangelogEntry],
200 structured_deltas: list[list[DomainOp]],
201 max_hotspots: int = 10,
202 ) -> list[FileHotspot]:
203 """Count how many times each file was touched across this release's commits."""
204 churn: CounterMap = {}
205 for delta in structured_deltas:
206 for file_path in touched_files(delta):
207 churn[file_path] = churn.get(file_path, 0) + 1
208 # Also count non-patch top-level ops (whole-file add/delete).
209 for op in delta:
210 if not _is_patch_op(op):
211 addr = op["address"]
212 churn[addr] = churn.get(addr, 0) + 1
213
214 top = sorted(churn, key=lambda p: churn[p], reverse=True)[:max_hotspots]
215 return [
216 FileHotspot(
217 file_path=p,
218 change_count=churn[p],
219 language=language_of(p),
220 )
221 for p in top
222 ]
223
224
225 def _build_refactor_events(
226 changelog: list[ChangelogEntry],
227 structured_deltas: list[list[DomainOp]],
228 max_events: int = 50,
229 ) -> list[RefactorEventSummary]:
230 """Extract structural refactoring events from commit structured_deltas."""
231 events: list[RefactorEventSummary] = []
232 for entry, delta in zip(changelog, structured_deltas):
233 cid = entry["commit_id"][:8]
234 for op in delta:
235 if op["op"] == "patch":
236 if "from_address" in op:
237 events.append(RefactorEventSummary(
238 kind="move",
239 address=op["address"],
240 detail=f"moved from {op.get('from_address', '?')}",
241 commit_id=cid,
242 ))
243 elif op["op"] == "insert":
244 addr = op["address"]
245 if "/" in addr: # file-level insert = new file
246 events.append(RefactorEventSummary(
247 kind="add",
248 address=addr,
249 detail=op.get("content_summary", ""),
250 commit_id=cid,
251 ))
252 elif op["op"] == "delete":
253 addr = op["address"]
254 if "/" in addr: # file-level delete = removed file
255 events.append(RefactorEventSummary(
256 kind="delete",
257 address=addr,
258 detail=op.get("content_summary", ""),
259 commit_id=cid,
260 ))
261 # Symbol-level renames: same body_hash, different name
262 for sym_op in flat_symbol_ops(delta):
263 if sym_op["op"] == "insert":
264 events.append(RefactorEventSummary(
265 kind="add",
266 address=sym_op["address"],
267 detail=sym_op.get("content_summary", ""),
268 commit_id=cid,
269 ))
270 elif sym_op["op"] == "delete":
271 events.append(RefactorEventSummary(
272 kind="delete",
273 address=sym_op["address"],
274 detail=sym_op.get("content_summary", ""),
275 commit_id=cid,
276 ))
277 if len(events) >= max_events:
278 break
279 return events[:max_events]
280
281
282 # ---------------------------------------------------------------------------
283 # Public API
284 # ---------------------------------------------------------------------------
285
286
def compute_release_analysis(
    root: pathlib.Path,
    release: ReleaseRecord,
    prev_snapshot_id: str | None = None,
) -> SemanticReleaseReport:
    """Build the semantic analysis report for *release*, never raising.

    Args:
        root: Repository root (must contain ``.muse/``).
        release: The release whose tip snapshot is analysed.
        prev_snapshot_id: Snapshot ID of the previous release. When given,
            the API surface is diffed against it; when omitted, every public
            symbol is reported as "added".

    Returns:
        The populated :class:`SemanticReleaseReport`. If analysis raises any
        exception, the failure is logged with its traceback and an empty
        report is returned instead, so an analysis failure never blocks a
        push.
    """
    try:
        report = _compute(root, release, prev_snapshot_id)
    except Exception:
        # Deliberate best-effort boundary: log and fall back.
        logger.warning(
            "⚠️ Semantic analysis failed for release %s — attaching empty report.",
            release.tag,
            exc_info=True,
        )
        report = _empty_report()
    return report
315
316
def _cap_manifest(manifest: Manifest) -> Manifest:
    """Truncate *manifest* when it holds too many semantic files.

    The manifest is returned unchanged unless it contains more than
    ``_MAX_SEMANTIC_FILES`` semantic files, in which case only its first
    ``_MAX_SEMANTIC_FILES`` entries (in iteration order) are kept.
    """
    semantic_count = sum(1 for path in manifest if is_semantic(path))
    if semantic_count <= _MAX_SEMANTIC_FILES:
        return manifest
    return dict(list(manifest.items())[:_MAX_SEMANTIC_FILES])


def _surface_from_symbols(sym_map: SymbolTreeIndex) -> ApiSurface:
    """Filter already-extracted symbol trees down to the public API surface."""
    surface: ApiSurface = {}
    for file_path, tree in sym_map.items():
        lang = language_of(file_path)
        for address, rec in tree.items():
            if _is_public_symbol(rec["name"], rec["kind"]):
                surface[address] = (lang, rec)
    return surface


def _compute(
    root: pathlib.Path,
    release: ReleaseRecord,
    prev_snapshot_id: str | None,
) -> SemanticReleaseReport:
    """Run the analysis for *release*; exceptions propagate to the caller.

    Steps:

    1. Read the release snapshot; return an empty report if it is missing.
    2. Extract symbols from the (possibly capped) manifest and derive
       language and symbol-kind statistics.
    3. Diff the public API surface against the previous snapshot, if any.
    4. Align the release changelog with walked commits to collect
       per-commit structured deltas, then derive changed-file count,
       hotspots and refactor events.
    5. Aggregate breaking changes, human/agent commit counts, agents,
       models and reviewers.

    Args:
        root: Repository root.
        release: Release being analysed.
        prev_snapshot_id: Snapshot ID of the previous release, or ``None``.

    Returns:
        The populated report, or an empty one when the snapshot is missing.
    """
    # -- Snapshot manifest for current release ---------------------------------
    snap = read_snapshot(root, release.snapshot_id)
    if snap is None:
        logger.warning("⚠️ Snapshot %s not found; analysis skipped.", release.snapshot_id[:8])
        return _empty_report()

    manifest = snap.manifest
    total_files = len(manifest)
    semantic_file_count = sum(1 for p in manifest if is_semantic(p))

    # Cap extraction to avoid blocking on huge snapshots.
    capped_manifest = _cap_manifest(manifest)

    sym_map = symbols_for_snapshot(root, capped_manifest)
    total_symbols = sum(len(tree) for tree in sym_map.values())
    # Language file counts use the full manifest; symbol counts only cover
    # the files that were actually extracted.
    languages = _build_language_stats(manifest, sym_map)
    symbols_by_kind = _build_symbol_kind_counts(sym_map)

    # -- API surface diff ------------------------------------------------------
    # Reuse the symbols extracted above instead of parsing the snapshot again.
    curr_surface = _surface_from_symbols(sym_map)
    if prev_snapshot_id:
        prev_snap = read_snapshot(root, prev_snapshot_id)
        prev_manifest = prev_snap.manifest if prev_snap else {}
        # Same capping rule as the current snapshot, so both surfaces are
        # truncated under identical conditions.
        prev_surface = _api_surface(root, _cap_manifest(prev_manifest))
    else:
        prev_surface = {}

    api_added, api_removed, api_modified = _build_api_changes(prev_surface, curr_surface)

    # -- Commit-level analysis from structured_deltas ---------------------------
    changelog = release.changelog
    commits = walk_commits_between(root, release.commit_id, None, max_commits=500)
    # Align commits to changelog (newest-first from walk, oldest-first in changelog).
    commit_map = {c.commit_id: c for c in commits}
    structured_deltas: list[list[DomainOp]] = []
    for entry in changelog:
        commit = commit_map.get(entry["commit_id"])
        if commit and commit.structured_delta:
            structured_deltas.append(commit.structured_delta["ops"])
        else:
            # Keep one (possibly empty) delta per changelog entry so the two
            # lists stay index-aligned.
            structured_deltas.append([])

    files_changed = len({
        p
        for delta in structured_deltas
        for p in touched_files(delta)
    })

    file_hotspots = _build_file_hotspots(changelog, structured_deltas)
    refactor_events = _build_refactor_events(changelog, structured_deltas)

    # -- Provenance aggregation ------------------------------------------------
    breaking: list[str] = []
    seen_bc: set[str] = set()
    human_commits = 0
    agent_commits = 0
    agents: set[str] = set()
    models: set[str] = set()

    for entry in changelog:
        # De-duplicate breaking changes while preserving first-seen order.
        for bc in entry.get("breaking_changes", []):
            if bc not in seen_bc:
                seen_bc.add(bc)
                breaking.append(bc)
        aid = entry.get("agent_id", "")
        mid = entry.get("model_id", "")
        # An entry with a non-empty agent_id counts as an agent commit.
        if aid:
            agent_commits += 1
            agents.add(aid)
        else:
            human_commits += 1
        if mid:
            models.add(mid)

    # NOTE(review): reviewers are gathered from every walked commit, not only
    # those listed in the changelog — confirm the walk is bounded to this
    # release.
    reviewers_set: set[str] = {
        reviewer
        for c in commits
        for reviewer in c.reviewed_by
    }

    return SemanticReleaseReport(
        languages=languages,
        total_files=total_files,
        semantic_files=semantic_file_count,
        total_symbols=total_symbols,
        symbols_by_kind=symbols_by_kind,
        files_changed=files_changed,
        api_added=api_added,
        api_removed=api_removed,
        api_modified=api_modified,
        file_hotspots=file_hotspots,
        refactor_events=refactor_events,
        breaking_changes=breaking,
        human_commits=human_commits,
        agent_commits=agent_commits,
        unique_agents=sorted(agents),
        unique_models=sorted(models),
        reviewers=sorted(reviewers_set),
    )
File History 1 commit
sha256:1c4b3e3a9a1f300774c3ee662b572a698d5fd405bf765a71e6011a2e9c3eaaaa feat: Muse — version control for the agent era Human 74 days ago