gabriel / muse public
clones.py python
499 lines 17.8 KB
Raw
1 """muse code clones — find duplicate and near-duplicate symbols.
2
3 Detects two tiers of code duplication from committed snapshot data:
4
5 **Exact clones**
6 Symbols with the same ``body_hash`` at different addresses. The body is
7 character-for-character identical (after normalisation) even if the name or
8 surrounding context differs. These are true copy-paste duplicates.
9
10 **Near-clones**
11 Symbols with the same ``signature_id`` but different ``body_hash``. Same
12 function signature, different implementation — strong candidates for
13 consolidation behind a shared abstraction.
14
15 Git has no concept of these. Git stores file diffs; Muse stores symbol
16 identity hashes. Clone detection is a single pass over the snapshot index.
17
18 Usage::
19
20 muse code clones
21 muse code clones --tier exact
22 muse code clones --tier near
23 muse code clones --kind function
24 muse code clones --language Python
25 muse code clones --file muse/core/
26 muse code clones --exclude-same-file
27 muse code clones --commit HEAD~10
28 muse code clones --min-cluster 3
29 muse code clones --json
30
31 Output::
32
33 Clone analysis — commit a1b2c3d4
34
35 Exact clones (2 clusters):
36 body_hash a1b2c3d4:
37 src/billing.py::compute_hash function
38 src/utils.py::compute_hash function
39 src/legacy.py::_hash function
40
41 Near-clones — same signature (3 clusters):
42 signature_id e5f6a7b8:
43 src/billing.py::validate function
44 src/auth.py::validate function
45
46 Files with most clone members:
47 src/billing.py 4 clone symbols
48
49 Flags:
50
51 ``--tier {exact|near|both}``
52 Which tier to report (default: both).
53
54 ``--kind KIND``
55 Restrict to symbols of this kind (function, class, method, …).
56
57 ``--language LANG``
58 Restrict to files of this language.
59
60 ``--file PATH``
61 Restrict to symbols whose file path starts with PATH.
62
63 ``--exclude-same-file``
64 Skip clusters where every member lives in the same file.
65
66 ``--min-cluster N``
67 Only show clusters with at least N members (default: 2).
68
69 ``--commit, -c REF``
70 Analyse a historical snapshot instead of HEAD.
71
72 ``--json``
73 Emit results as JSON.
74 """
75
76 import argparse
77 import json
78 import logging
79 import pathlib
80 from typing import Literal, TypedDict
81
82 from muse.core.errors import ExitCode
83 from muse.core.envelope import EnvelopeJson, make_envelope
84 from muse.core.indices import HashOccurrenceIndex, load_hash_occurrence
85 from muse.core.repo import require_repo
86 from muse.core.types import Manifest
87 from muse.core.refs import read_current_branch
88 from muse.core.commits import resolve_commit_ref
89 from muse.core.snapshots import get_commit_snapshot_manifest
90 from muse.core.symbol_cache import SymbolCache, load_symbol_cache
91 from muse.core.timing import start_timer
92 from muse.plugins.code._query import language_of, symbols_for_snapshot
93 from muse.plugins.code.ast_parser import SymbolRecord
94 from muse.core.validation import clamp_int, sanitize_display
95
# Grouping index used by ``find_clones``: a shared hash (``body_hash`` or
# ``signature_id``) mapped to every ``(address, record)`` pair that carries it.
type _SymIndex = dict[str, list[tuple[str, SymbolRecord]]]
# File path → number of clone-cluster memberships counted for that file
# (see ``_file_hotspots``).
type _FileCountMap = dict[str, int]

logger = logging.getLogger(__name__)

# Values accepted by ``--tier``: exact clones only, near-clones only, or both.
CloneTier = Literal["exact", "near", "both"]
102
103 # ---------------------------------------------------------------------------
104 # Typed output shapes
105 # ---------------------------------------------------------------------------
106
class _MemberDict(TypedDict):
    """JSON shape of one symbol inside a clone cluster.

    Instances are built by ``_CloneCluster.to_dict``; apart from ``address``
    and ``language`` every value is copied from the symbol's record.
    """

    # Symbol address; the text before the first "::" is the file path.
    address: str
    # Symbol kind as stored on the record (e.g. function, class, method).
    kind: str
    # Result of ``language_of`` applied to the file part of ``address``.
    language: str
    # Hash shared by symbols reported as exact clones of each other.
    body_hash: str
    # Hash shared by symbols reported as near-clones of each other.
    signature_id: str
    # Copied verbatim from the record's ``content_id`` field.
    content_id: str
114
class _ClusterDict(TypedDict):
    """JSON shape of one clone cluster, as produced by ``_CloneCluster.to_dict``."""

    # "exact" or "near" — the tier the cluster was detected in.
    tier: str
    # The shared hash: a ``body_hash`` for exact clusters, a ``signature_id``
    # for near clusters.
    hash: str
    # Number of entries in ``members``.
    count: int
    # One entry per symbol in the cluster.
    members: list[_MemberDict]
120
class _FileHotspot(TypedDict):
    """One row of the "files with most clone members" ranking."""

    # File path (the part of a symbol address before the first "::").
    file: str
    # How many cluster memberships were counted for this file.
    clone_symbols: int
124
class _ClonesOutputJson(EnvelopeJson):
    """Top-level JSON emitted by ``muse code clones --json``.

    Extends ``EnvelopeJson``; ``run`` fills the inherited envelope fields from
    ``make_envelope(elapsed)`` and the fields below from its own arguments and
    results.

    Fields
    ------
    commit                  Commit ID that was analysed (``run`` passes
                            ``commit.commit_id`` unmodified).
    branch                  Current branch name.
    tier                    Which tier was requested: "exact", "near", or "both".
    min_cluster             Minimum cluster size threshold used.
    kind_filter             Symbol-kind filter, or None.
    language_filter         Language filter, or None.
    file_filter             File-prefix filter, or None.
    exclude_same_file       Whether same-file clusters were suppressed.
    exact_clone_clusters    Number of exact-clone clusters found.
    near_clone_clusters     Number of near-clone clusters found.
    total_symbols_involved  Sum of member counts over all clusters.
    file_hotspots           Ranked list of files by clone-member count.
    clusters                Full cluster data as :class:`_ClusterDict` entries.
    """

    commit: str
    branch: str
    tier: str
    min_cluster: int
    kind_filter: str | None
    language_filter: str | None
    file_filter: str | None
    exclude_same_file: bool
    exact_clone_clusters: int
    near_clone_clusters: int
    total_symbols_involved: int
    file_hotspots: list[_FileHotspot]
    clusters: list[_ClusterDict]
158
159 # ---------------------------------------------------------------------------
160 # Core data model
161 # ---------------------------------------------------------------------------
162
class _CloneCluster:
    """Symbols that share one hash and are therefore reported together.

    Attributes:
        tier: Tier the cluster belongs to.
        hash_value: The hash every member has in common.
        members: ``(address, record)`` pairs, in the order they were grouped.
    """

    def __init__(
        self,
        tier: CloneTier,
        hash_value: str,
        members: list[tuple[str, SymbolRecord]],
    ) -> None:
        self.tier = tier
        self.hash_value = hash_value
        self.members = members

    def to_dict(self) -> _ClusterDict:
        """Return the JSON-serialisable form of this cluster."""
        serialised: list[_MemberDict] = []
        for address, record in self.members:
            # The file path is everything before the first "::" of the address.
            file_path = address.partition("::")[0]
            serialised.append(
                _MemberDict(
                    address=address,
                    kind=record["kind"],
                    language=language_of(file_path),
                    body_hash=record["body_hash"],
                    signature_id=record["signature_id"],
                    content_id=record["content_id"],
                )
            )
        return _ClusterDict(
            tier=self.tier,
            hash=self.hash_value,
            count=len(self.members),
            members=serialised,
        )
193
194 # ---------------------------------------------------------------------------
195 # Detection logic
196 # ---------------------------------------------------------------------------
197
def find_clones(
    root: pathlib.Path,
    manifest: Manifest,
    tier: CloneTier,
    kind_filter: str | None,
    min_cluster: int,
    *,
    language_filter: str | None = None,
    file_filter: str | None = None,
    exclude_same_file: bool = False,
    cache: SymbolCache | None = None,
) -> list[_CloneCluster]:
    """Group the symbols of *manifest* into clone clusters.

    Args:
        root: Repository root, forwarded to ``symbols_for_snapshot``.
        manifest: Snapshot manifest (file path → object ID).
        tier: "exact", "near", or "both" — which kinds of cluster to build.
        kind_filter: Forwarded to ``symbols_for_snapshot``; ``None`` means no
            kind restriction.
        min_cluster: Smallest number of members a cluster needs to be kept.
        language_filter: Forwarded to ``symbols_for_snapshot``; ``None`` means
            no language restriction.
        file_filter: When given, only files whose path starts with this
            prefix contribute symbols.
        exclude_same_file: When true, drop clusters whose members all live in
            one file.
        cache: Optional ``SymbolCache`` forwarded to ``symbols_for_snapshot``.

    Returns:
        Clusters ordered by descending size, then tier name ("exact" sorts
        before "near"), then hash.
    """
    per_file = symbols_for_snapshot(
        root,
        manifest,
        kind_filter=kind_filter,
        language_filter=language_filter,
        cache=cache,
    )

    # Flatten into (address, record) pairs in path order, then address order.
    # Symbols of kind "import" never take part in clone detection.
    candidates: list[tuple[str, SymbolRecord]] = []
    for path, tree in sorted(per_file.items()):
        if file_filter is not None and not path.startswith(file_filter):
            continue
        for address, record in sorted(tree.items()):
            if record["kind"] == "import":
                continue
            candidates.append((address, record))

    def _reportable(group: list[tuple[str, SymbolRecord]]) -> bool:
        # Size threshold first, then the optional same-file suppression.
        if len(group) < min_cluster:
            return False
        return not (exclude_same_file and _all_same_file(group))

    found: list[_CloneCluster] = []

    if tier in ("exact", "both"):
        by_body: _SymIndex = {}
        for entry in candidates:
            by_body.setdefault(entry[1]["body_hash"], []).append(entry)
        for body_hash, group in sorted(by_body.items()):
            if _reportable(group):
                found.append(_CloneCluster("exact", body_hash, group))

    if tier in ("near", "both"):
        by_signature: _SymIndex = {}
        for entry in candidates:
            by_signature.setdefault(entry[1]["signature_id"], []).append(entry)
        for signature_id, group in sorted(by_signature.items()):
            # A near-clone cluster needs at least two distinct bodies behind
            # the same signature; identical bodies belong to the exact tier.
            distinct_bodies = {record["body_hash"] for _, record in group}
            if len(distinct_bodies) < 2:
                continue
            if _reportable(group):
                found.append(_CloneCluster("near", signature_id, group))

    return sorted(
        found,
        key=lambda cluster: (-len(cluster.members), cluster.tier, cluster.hash_value),
    )
272
def _candidate_files_from_index(
    index: HashOccurrenceIndex,
    min_cluster: int,
    file_filter: str | None,
) -> set[str]:
    """Collect the files that own at least one large-enough hash group.

    For every entry of *index*, the addresses are narrowed to those whose
    file path starts with *file_filter* (when one is given). If at least
    *min_cluster* addresses remain, their file paths are added to the result.
    """
    candidates: set[str] = set()
    for _hash, addresses in index.items():
        # File path = text before the first "::" of each address.
        owning_files = [address.partition("::")[0] for address in addresses]
        if file_filter is not None:
            owning_files = [fp for fp in owning_files if fp.startswith(file_filter)]
        if len(owning_files) < min_cluster:
            continue
        candidates.update(owning_files)
    return candidates
293
def _all_same_file(members: list[tuple[str, SymbolRecord]]) -> bool:
    """Tell whether all *members* share one file path.

    The file path is the part of each address before the first "::".
    An empty *members* list yields ``False``.
    """
    distinct_files: set[str] = set()
    for address, _record in members:
        distinct_files.add(address.partition("::")[0])
    return len(distinct_files) == 1
298
def _file_hotspots(
    clusters: list[_CloneCluster],
    top: int = 10,
) -> list[_FileHotspot]:
    """Return the *top* files ranked by how many cluster members they hold.

    Every membership is counted, so a symbol that sits in two clusters adds
    two to its file's total. Files with equal totals keep the order in which
    they were first encountered.
    """
    tally: _FileCountMap = {}
    for group in clusters:
        for address, _record in group.members:
            path = address.partition("::")[0]
            if path in tally:
                tally[path] += 1
            else:
                tally[path] = 1
    # Ascending sort on the negated count is a stable descending sort.
    by_count_desc = sorted(tally.items(), key=lambda item: -item[1])
    return [
        _FileHotspot(file=path, clone_symbols=hits)
        for path, hits in by_count_desc[:top]
    ]
311
312 # ---------------------------------------------------------------------------
313 # CLI registration and entry point
314 # ---------------------------------------------------------------------------
315
def register(subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]") -> None:
    """Attach the ``clones`` subcommand and all of its flags to *subparsers*.

    The module docstring is used as the command description and ``run`` is
    installed as the handler via ``set_defaults(func=run)``.
    """
    cmd = subparsers.add_parser(
        "clones",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        help="Find exact and near-duplicate symbols in the committed snapshot.",
    )
    add = cmd.add_argument

    add(
        "--tier",
        "-t",
        choices=["exact", "near", "both"],
        default="both",
        help="Tier to report: exact, near, or both (default: both).",
    )
    add(
        "--kind",
        "-k",
        dest="kind_filter",
        metavar="KIND",
        default=None,
        help="Restrict to symbols of this kind (function, class, method, …).",
    )
    add(
        "--language",
        "-l",
        dest="language_filter",
        metavar="LANG",
        default=None,
        help="Restrict to files of this language.",
    )
    add(
        "--file",
        "-f",
        dest="file_filter",
        metavar="PATH",
        default=None,
        help="Restrict to symbols whose file path starts with PATH.",
    )
    add(
        "--exclude-same-file",
        dest="exclude_same_file",
        action="store_true",
        help="Skip clusters where all members are in the same file.",
    )
    add(
        "--min-cluster",
        "-m",
        dest="min_cluster",
        metavar="N",
        type=int,
        default=2,
        help="Only show clusters with at least N members (default: 2).",
    )
    add(
        "--commit",
        "-c",
        dest="ref",
        metavar="REF",
        default=None,
        help="Analyse this commit instead of HEAD.",
    )
    add(
        "--json",
        "-j",
        dest="json_out",
        action="store_true",
        help="Emit results as JSON.",
    )

    cmd.set_defaults(func=run)
357
def run(args: argparse.Namespace) -> None:
    """Find exact and near-duplicate symbols in the committed snapshot.

    Exact clones share the same ``body_hash``; near-clones share the same
    ``signature_id`` but have differing bodies. The command resolves the
    requested commit, loads its snapshot manifest, delegates detection to
    ``find_clones`` and prints either a JSON document or a text report.

    Agent quickstart
    ----------------
    ::

        muse code clones --json
        muse code clones --tier exact --json
        muse code clones --kind function --language Python --json

    JSON fields
    -----------
    The payload is a ``_ClonesOutputJson``: the envelope fields produced by
    ``make_envelope(elapsed)`` plus

    commit                  ``commit_id`` of the analysed commit.
    branch                  Current branch name.
    tier                    Requested tier: ``"exact"``, ``"near"`` or ``"both"``.
    min_cluster             Minimum cluster size used.
    kind_filter             ``--kind`` value, or null.
    language_filter         ``--language`` value, or null.
    file_filter             ``--file`` value, or null.
    exclude_same_file       Whether same-file clusters were suppressed.
    exact_clone_clusters    Number of exact clusters.
    near_clone_clusters     Number of near clusters.
    total_symbols_involved  Sum of member counts over all clusters.
    file_hotspots           List of ``{"file", "clone_symbols"}`` rows.
    clusters                List of cluster objects, each with ``tier``,
                            ``hash``, ``count`` and ``members`` (objects with
                            ``address``, ``kind``, ``language``, ``body_hash``,
                            ``signature_id``, ``content_id``).

    Exit behaviour
    --------------
    Returns normally when the analysis completes. Raises
    ``SystemExit(ExitCode.USER_ERROR)`` when ``--min-cluster`` is below 2 or
    the commit ref cannot be resolved. Being outside a repository is handled
    by ``require_repo()``, whose behaviour is defined elsewhere.
    """
    elapsed = start_timer()
    tier: CloneTier = args.tier
    kind_filter: str | None = args.kind_filter
    language_filter: str | None = args.language_filter
    file_filter: str | None = args.file_filter
    exclude_same_file: bool = args.exclude_same_file
    ref: str | None = args.ref
    json_out: bool = args.json_out

    # Validate the raw value before clamp_int touches it, so the message shows
    # what the user actually typed and values below 1 get the same clean
    # USER_ERROR exit as 1 does. How clamp_int treats out-of-range input is
    # not visible from this module.
    requested_min: int = args.min_cluster
    if requested_min < 2:
        logger.error("--min-cluster must be at least 2, got %d", requested_min)
        raise SystemExit(ExitCode.USER_ERROR)
    min_cluster: int = clamp_int(requested_min, 1, 10000, 'min_cluster')

    root = require_repo()

    branch = read_current_branch(root)
    commit = resolve_commit_ref(root, branch, ref)
    if commit is None:
        logger.error("Commit %r not found.", ref or "HEAD")
        raise SystemExit(ExitCode.USER_ERROR)

    manifest = get_commit_snapshot_manifest(root, commit.commit_id) or {}

    # Index fast path: when analysing HEAD with --tier exact, restrict the
    # manifest to files that the prebuilt hash_occurrence index lists as clone
    # candidates, so files without candidates are never parsed. The full
    # manifest is kept when the index is absent/empty or yields no candidates.
    if ref is None and tier == "exact":
        ho_index = load_hash_occurrence(root)
        if ho_index:
            candidate_files = _candidate_files_from_index(ho_index, min_cluster, file_filter)
            if candidate_files:
                manifest = {fp: oid for fp, oid in manifest.items() if fp in candidate_files}

    # Shared cache handed to find_clones so parsed blobs can be reused.
    cache = load_symbol_cache(root)

    cluster_list = find_clones(
        root,
        manifest,
        tier,
        kind_filter,
        min_cluster,
        language_filter=language_filter,
        file_filter=file_filter,
        exclude_same_file=exclude_same_file,
        cache=cache,
    )

    exact_clusters = [c for c in cluster_list if c.tier == "exact"]
    near_clusters = [c for c in cluster_list if c.tier == "near"]
    # Memberships, not distinct symbols: a symbol in two clusters counts twice.
    total_symbols = sum(len(c.members) for c in cluster_list)
    hotspots = _file_hotspots(cluster_list)

    if json_out:
        print(json.dumps(_ClonesOutputJson(
            **make_envelope(elapsed),
            commit=commit.commit_id,
            branch=branch,
            tier=tier,
            min_cluster=min_cluster,
            kind_filter=kind_filter,
            language_filter=language_filter,
            file_filter=file_filter,
            exclude_same_file=exclude_same_file,
            exact_clone_clusters=len(exact_clusters),
            near_clone_clusters=len(near_clusters),
            total_symbols_involved=total_symbols,
            file_hotspots=hotspots,
            clusters=[c.to_dict() for c in cluster_list],
        )))
        return

    # ---- Human-readable report -------------------------------------------
    print(f"\nClone analysis — commit {commit.commit_id}")
    if kind_filter:
        print(f" (kind: {kind_filter})")
    if language_filter:
        print(f" (language: {language_filter})")
    if file_filter:
        print(f" (file prefix: {file_filter})")
    if exclude_same_file:
        print(" (same-file clusters excluded)")
    print("─" * 62)

    if not cluster_list:
        print("\n ✅ No clones detected.")
        return

    if exact_clusters and tier in ("exact", "both"):
        print(f"\nExact clones ({len(exact_clusters)} cluster(s)):")
        for cl in exact_clusters:
            print(f" body_hash {cl.hash_value}:")
            for addr, rec in cl.members:
                print(f" {sanitize_display(addr)} {rec['kind']}")

    if near_clusters and tier in ("near", "both"):
        print(f"\nNear-clones — same signature ({len(near_clusters)} cluster(s)):")
        for cl in near_clusters:
            print(f" signature_id {cl.hash_value}:")
            for addr, rec in cl.members:
                print(f" {sanitize_display(addr)} {rec['kind']} (body {rec['body_hash']})")

    print(f"\n {len(cluster_list)} clone cluster(s), {total_symbols} total symbol(s) involved")

    if hotspots:
        print("\nFiles with most clone members:")
        # The text report shows only the first five rows of the ranking.
        for h in hotspots[:5]:
            print(f" {sanitize_display(h['file'])} {h['clone_symbols']} clone symbol(s)")
File History 1 commit