clones.py
python
sha256:d11a87833d5fad6059b7662844bf5448a8911a17cce7a51811f71ad394f248eb
bump to v0.2.0rc13
Human
patch
6 days ago
| 1 | """muse code clones — find duplicate and near-duplicate symbols. |
| 2 | |
| 3 | Detects two tiers of code duplication from committed snapshot data: |
| 4 | |
| 5 | **Exact clones** |
| 6 | Symbols with the same ``body_hash`` at different addresses. The body is |
| 7 | character-for-character identical (after normalisation) even if the name or |
| 8 | surrounding context differs. These are true copy-paste duplicates. |
| 9 | |
| 10 | **Near-clones** |
| 11 | Symbols with the same ``signature_id`` but different ``body_hash``. Same |
| 12 | function signature, different implementation — strong candidates for |
| 13 | consolidation behind a shared abstraction. |
| 14 | |
| 15 | Git has no concept of these. Git stores file diffs; Muse stores symbol |
| 16 | identity hashes. Clone detection is a single pass over the snapshot index. |
| 17 | |
| 18 | Usage:: |
| 19 | |
| 20 | muse code clones |
| 21 | muse code clones --tier exact |
| 22 | muse code clones --tier near |
| 23 | muse code clones --kind function |
| 24 | muse code clones --language Python |
| 25 | muse code clones --file muse/core/ |
| 26 | muse code clones --exclude-same-file |
| 27 | muse code clones --commit HEAD~10 |
| 28 | muse code clones --min-cluster 3 |
| 29 | muse code clones --json |
| 30 | |
| 31 | Output:: |
| 32 | |
| 33 | Clone analysis — commit a1b2c3d4 |
| 34 | |
| 35 | Exact clones (2 clusters): |
| 36 | body_hash a1b2c3d4: |
| 37 | src/billing.py::compute_hash function |
| 38 | src/utils.py::compute_hash function |
| 39 | src/legacy.py::_hash function |
| 40 | |
| 41 | Near-clones — same signature (3 clusters): |
| 42 | signature_id e5f6a7b8: |
| 43 | src/billing.py::validate function |
| 44 | src/auth.py::validate function |
| 45 | |
| 46 | Files with most clone members: |
| 47 | src/billing.py 4 clone symbols |
| 48 | |
| 49 | Flags: |
| 50 | |
| 51 | ``--tier {exact|near|both}`` |
| 52 | Which tier to report (default: both). |
| 53 | |
| 54 | ``--kind KIND`` |
| 55 | Restrict to symbols of this kind (function, class, method, …). |
| 56 | |
| 57 | ``--language LANG`` |
| 58 | Restrict to files of this language. |
| 59 | |
| 60 | ``--file PATH`` |
| 61 | Restrict to symbols whose file path starts with PATH. |
| 62 | |
| 63 | ``--exclude-same-file`` |
| 64 | Skip clusters where every member lives in the same file. |
| 65 | |
| 66 | ``--min-cluster N`` |
| 67 | Only show clusters with at least N members (default: 2). |
| 68 | |
| 69 | ``--commit, -c REF`` |
| 70 | Analyse a historical snapshot instead of HEAD. |
| 71 | |
| 72 | ``--json`` |
| 73 | Emit results as JSON. |
| 74 | """ |
| 75 | |
| 76 | import argparse |
| 77 | import json |
| 78 | import logging |
| 79 | import pathlib |
| 80 | from typing import Literal, TypedDict |
| 81 | |
| 82 | from muse.core.errors import ExitCode |
| 83 | from muse.core.envelope import EnvelopeJson, make_envelope |
| 84 | from muse.core.indices import HashOccurrenceIndex, load_hash_occurrence |
| 85 | from muse.core.repo import require_repo |
| 86 | from muse.core.types import Manifest |
| 87 | from muse.core.refs import read_current_branch |
| 88 | from muse.core.commits import resolve_commit_ref |
| 89 | from muse.core.snapshots import get_commit_snapshot_manifest |
| 90 | from muse.core.symbol_cache import SymbolCache, load_symbol_cache |
| 91 | from muse.core.timing import start_timer |
| 92 | from muse.plugins.code._query import language_of, symbols_for_snapshot |
| 93 | from muse.plugins.code.ast_parser import SymbolRecord |
| 94 | from muse.core.validation import clamp_int, sanitize_display |
| 95 | |
| 96 | type _SymIndex = dict[str, list[tuple[str, SymbolRecord]]] |
| 97 | type _FileCountMap = dict[str, int] |
| 98 | |
| 99 | logger = logging.getLogger(__name__) |
| 100 | |
| 101 | CloneTier = Literal["exact", "near", "both"] |
| 102 | |
| 103 | # --------------------------------------------------------------------------- |
| 104 | # Typed output shapes |
| 105 | # --------------------------------------------------------------------------- |
| 106 | |
class _MemberDict(TypedDict):
    """JSON shape of a single symbol listed inside a clone cluster."""

    # Symbol address; the text before the first "::" is the file path.
    address: str
    # Symbol kind taken from the symbol record (e.g. function, class, method).
    kind: str
    # Language reported by ``language_of`` for the file part of the address.
    language: str
    # The record's ``body_hash``.
    body_hash: str
    # The record's ``signature_id``.
    signature_id: str
    # The record's ``content_id``.
    content_id: str
| 114 | |
class _ClusterDict(TypedDict):
    """JSON shape of one clone cluster."""

    # Tier the cluster was detected in.
    tier: str
    # Hash value shared by every member of the cluster.
    hash: str
    # Number of entries in ``members``.
    count: int
    # One entry per symbol in the cluster.
    members: list[_MemberDict]
| 120 | |
class _FileHotspot(TypedDict):
    """JSON shape of one row in the per-file clone ranking."""

    # File path (the part of a symbol address before "::").
    file: str
    # How many clone-cluster members live in that file.
    clone_symbols: int
| 124 | |
class _ClonesOutputJson(EnvelopeJson):
    """Payload printed by ``muse code clones --json``.

    Extends :class:`EnvelopeJson`, so the envelope keys are emitted alongside
    the clone-specific keys listed here.

    Keys
    ----
    commit                  ID of the commit that was analysed.
    branch                  Name of the current branch.
    tier                    Requested tier: "exact", "near", or "both".
    min_cluster             Minimum cluster size that was applied.
    kind_filter             Symbol-kind filter, or None when not given.
    language_filter         Language filter, or None when not given.
    file_filter             File-path prefix filter, or None when not given.
    exclude_same_file       True when single-file clusters were suppressed.
    exact_clone_clusters    How many exact-clone clusters were found.
    near_clone_clusters     How many near-clone clusters were found.
    total_symbols_involved  Sum of the member counts of all clusters.
    file_hotspots           Files ranked by clone-member count.
    clusters                Every cluster as a :class:`_ClusterDict`.
    """

    commit: str
    branch: str
    tier: str
    min_cluster: int
    kind_filter: str | None
    language_filter: str | None
    file_filter: str | None
    exclude_same_file: bool
    exact_clone_clusters: int
    near_clone_clusters: int
    total_symbols_involved: int
    file_hotspots: list[_FileHotspot]
    clusters: list[_ClusterDict]
| 158 | |
| 159 | # --------------------------------------------------------------------------- |
| 160 | # Core data model |
| 161 | # --------------------------------------------------------------------------- |
| 162 | |
class _CloneCluster:
    """One clone cluster: the symbols that share a single hash value.

    Attributes:
        tier: Tier the cluster was built for.
        hash_value: The hash shared by all members.
        members: ``(address, record)`` pairs, kept in the order supplied.
    """

    def __init__(
        self,
        tier: CloneTier,
        hash_value: str,
        members: list[tuple[str, SymbolRecord]],
    ) -> None:
        self.tier = tier
        self.hash_value = hash_value
        # The list is stored by reference, not copied.
        self.members = members

    def to_dict(self) -> _ClusterDict:
        """Return this cluster in its JSON-serialisable form."""
        member_dicts: list[_MemberDict] = []
        for address, record in self.members:
            member_dicts.append(
                _MemberDict(
                    address=address,
                    kind=record["kind"],
                    # The language is derived from the file part of the address.
                    language=language_of(address.split("::")[0]),
                    body_hash=record["body_hash"],
                    signature_id=record["signature_id"],
                    content_id=record["content_id"],
                )
            )
        return _ClusterDict(
            tier=self.tier,
            hash=self.hash_value,
            count=len(self.members),
            members=member_dicts,
        )
| 193 | |
| 194 | # --------------------------------------------------------------------------- |
| 195 | # Detection logic |
| 196 | # --------------------------------------------------------------------------- |
| 197 | |
def find_clones(
    root: pathlib.Path,
    manifest: Manifest,
    tier: CloneTier,
    kind_filter: str | None,
    min_cluster: int,
    *,
    language_filter: str | None = None,
    file_filter: str | None = None,
    exclude_same_file: bool = False,
    cache: SymbolCache | None = None,
) -> list[_CloneCluster]:
    """Build clone clusters from *manifest*.

    Symbols are loaded with ``symbols_for_snapshot``, symbols of kind
    ``"import"`` are dropped, and the remainder is grouped by ``body_hash``
    (exact tier) and/or ``signature_id`` (near tier).

    Args:
        root: Repository root (object store location).
        manifest: Snapshot manifest: file path → SHA-256 object ID.
        tier: Which clone tier(s) to detect: "exact", "near", "both".
        kind_filter: If set, only analyse symbols of this kind.
        min_cluster: Minimum cluster size to report. Required (there is
            no default). Values below 2 are treated as 2, because
            a single symbol cannot be a clone of anything.
        language_filter: If set, only analyse files of this language.
        file_filter: If set, only analyse symbols whose file path starts
            with this prefix (e.g. "muse/core/").
        exclude_same_file: If True, skip clusters where all members are in
            the same file.
        cache: Optional shared ``SymbolCache`` forwarded to
            ``symbols_for_snapshot``.

    Returns:
        The clusters, ordered by descending member count, then tier name
        ("exact" sorts before "near"), then hash value. A near cluster
        lists every symbol that shares the signature, including members
        whose bodies are identical to each other.
    """
    # A cluster needs at least two members. Without this floor the exact
    # tier would report every symbol as its own cluster for min_cluster <= 1,
    # while the near tier (which demands two distinct bodies) would not.
    threshold = max(min_cluster, 2)

    sym_map = symbols_for_snapshot(
        root,
        manifest,
        kind_filter=kind_filter,
        language_filter=language_filter,
        cache=cache,
    )

    # Flatten to a list of (address, record), applying file_filter.
    # Sorting files and addresses keeps member order deterministic.
    all_syms: list[tuple[str, SymbolRecord]] = [
        (addr, rec)
        for fp, tree in sorted(sym_map.items())
        if file_filter is None or fp.startswith(file_filter)
        for addr, rec in sorted(tree.items())
        if rec["kind"] != "import"
    ]

    clusters: list[_CloneCluster] = []

    if tier in ("exact", "both"):
        body_index: _SymIndex = {}
        for addr, rec in all_syms:
            body_index.setdefault(rec["body_hash"], []).append((addr, rec))
        for body_hash, members in sorted(body_index.items()):
            if len(members) < threshold:
                continue
            if exclude_same_file and _all_same_file(members):
                continue
            clusters.append(_CloneCluster("exact", body_hash, members))

    if tier in ("near", "both"):
        sig_index: _SymIndex = {}
        for addr, rec in all_syms:
            sig_index.setdefault(rec["signature_id"], []).append((addr, rec))
        for sig_id, members in sorted(sig_index.items()):
            # Near-clone: same signature, at least two DIFFERENT body hashes.
            unique_bodies = {r["body_hash"] for _, r in members}
            if len(members) < threshold or len(unique_bodies) <= 1:
                continue
            if exclude_same_file and _all_same_file(members):
                continue
            clusters.append(_CloneCluster("near", sig_id, members))

    # Sort: largest clusters first, then by tier (exact before near), then hash.
    clusters.sort(key=lambda c: (-len(c.members), c.tier, c.hash_value))
    return clusters
| 272 | |
def _candidate_files_from_index(
    index: HashOccurrenceIndex,
    min_cluster: int,
    file_filter: str | None,
) -> set[str]:
    """Collect the files that can contribute to a cluster according to *index*.

    For every hash in *index*, the file part (text before ``"::"``) of each
    address is taken and, when *file_filter* is given, kept only if it starts
    with that prefix.  If at least *min_cluster* addresses survive, their
    files are added to the result.

    Returns:
        The set of surviving file paths (empty when nothing qualifies).
    """
    candidates: set[str] = set()
    for _hash, occurrences in index.items():
        # Split each address once and reuse the file part for both the
        # prefix test and the final collection.
        paths = [address.split("::")[0] for address in occurrences]
        if file_filter is not None:
            paths = [path for path in paths if path.startswith(file_filter)]
        if len(paths) < min_cluster:
            continue
        candidates.update(paths)
    return candidates
| 293 | |
def _all_same_file(members: list[tuple[str, SymbolRecord]]) -> bool:
    """Tell whether all *members* of a cluster come from one source file.

    A member's file is the part of its address before the first ``"::"``.
    An empty list yields ``False`` because it names no file at all.
    """
    distinct_files: set[str] = set()
    for address, _record in members:
        distinct_files.add(address.partition("::")[0])
    return len(distinct_files) == 1
| 298 | |
def _file_hotspots(
    clusters: list[_CloneCluster],
    top: int = 10,
) -> list[_FileHotspot]:
    """Return the *top* files ordered by how many cluster members they hold.

    Each member is counted once per cluster it appears in, so a symbol that
    belongs to two clusters contributes two to its file's total.  Files with
    equal totals keep the order in which they were first encountered.
    """
    tally: _FileCountMap = {}
    for group in clusters:
        for address, _record in group.members:
            path = address.split("::")[0]
            tally.setdefault(path, 0)
            tally[path] += 1
    # Ascending sort on the negated count gives descending counts and, being
    # stable, leaves ties in first-seen order.
    by_count_desc = sorted(tally.items(), key=lambda pair: -pair[1])
    return [
        _FileHotspot(file=path, clone_symbols=total)
        for path, total in by_count_desc[:top]
    ]
| 311 | |
| 312 | # --------------------------------------------------------------------------- |
| 313 | # CLI registration and entry point |
| 314 | # --------------------------------------------------------------------------- |
| 315 | |
def register(subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]") -> None:
    """Attach the ``clones`` subcommand and its options to *subparsers*.

    The module docstring is used verbatim as the command description, and
    ``run`` is installed as the handler through ``set_defaults(func=...)``.
    """
    clones_parser = subparsers.add_parser(
        "clones",
        help="Find exact and near-duplicate symbols in the committed snapshot.",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flags, keyword options) for each argument, in registration order.
    option_table = [
        (("--tier", "-t"), {
            "default": "both",
            "choices": ["exact", "near", "both"],
            "help": "Tier to report: exact, near, or both (default: both).",
        }),
        (("--kind", "-k"), {
            "default": None,
            "metavar": "KIND",
            "dest": "kind_filter",
            "help": "Restrict to symbols of this kind (function, class, method, …).",
        }),
        (("--language", "-l"), {
            "default": None,
            "metavar": "LANG",
            "dest": "language_filter",
            "help": "Restrict to files of this language.",
        }),
        (("--file", "-f"), {
            "default": None,
            "metavar": "PATH",
            "dest": "file_filter",
            "help": "Restrict to symbols whose file path starts with PATH.",
        }),
        (("--exclude-same-file",), {
            "action": "store_true",
            "dest": "exclude_same_file",
            "help": "Skip clusters where all members are in the same file.",
        }),
        (("--min-cluster", "-m"), {
            "type": int,
            "default": 2,
            "metavar": "N",
            "dest": "min_cluster",
            "help": "Only show clusters with at least N members (default: 2).",
        }),
        (("--commit", "-c"), {
            "default": None,
            "metavar": "REF",
            "dest": "ref",
            "help": "Analyse this commit instead of HEAD.",
        }),
        (("--json", "-j"), {
            "action": "store_true",
            "dest": "json_out",
            "help": "Emit results as JSON.",
        }),
    ]
    for flags, options in option_table:
        clones_parser.add_argument(*flags, **options)

    clones_parser.set_defaults(func=run)
| 357 | |
def run(args: argparse.Namespace) -> None:
    """Find exact and near-duplicate symbols in the committed snapshot.

    Exact clones share the same ``body_hash`` (identical implementation).
    Near-clones share the same ``signature_id`` but differ in body — same
    contract, different implementation.  Both are candidates for extraction
    behind a shared abstraction.

    Steps: validate ``--min-cluster``, resolve the commit on the current
    branch, load its snapshot manifest, optionally narrow the manifest with
    the hash-occurrence index, build clusters with ``find_clones``, then
    print either JSON or a text report.

    Agent quickstart
    ----------------
    ::

        muse code clones --json
        muse code clones --tier exact --json
        muse code clones --kind function --language Python --json

    JSON fields
    -----------
    The envelope keys returned by ``make_envelope`` plus:

    commit                  ID of the commit that was analysed.
    branch                  Current branch name.
    tier                    Requested tier: ``"exact"``, ``"near"``, or ``"both"``.
    min_cluster             Minimum cluster size that was applied.
    kind_filter             ``--kind`` value, or null.
    language_filter         ``--language`` value, or null.
    file_filter             ``--file`` value, or null.
    exclude_same_file       Whether same-file clusters were suppressed.
    exact_clone_clusters    Number of exact-clone clusters.
    near_clone_clusters     Number of near-clone clusters.
    total_symbols_involved  Sum of member counts over all clusters.
    file_hotspots           Up to 10 ``{file, clone_symbols}`` rows, highest first.
    clusters                Cluster objects, each with ``tier`` (``"exact"`` or
                            ``"near"``), ``hash`` (shared hash), ``count``, and
                            ``members`` (objects with ``address``, ``kind``,
                            ``language``, ``body_hash``, ``signature_id``,
                            ``content_id``).

    Exit behaviour
    --------------
    Returns normally when the analysis completes.  Raises
    ``SystemExit(ExitCode.USER_ERROR)`` when ``--min-cluster`` is below 2 or
    the commit cannot be resolved.  ``require_repo()`` is called to locate
    the repository; how it reacts outside a repository is defined elsewhere.
    """
    elapsed = start_timer()
    tier: CloneTier = args.tier
    kind_filter: str | None = args.kind_filter
    language_filter: str | None = args.language_filter
    file_filter: str | None = args.file_filter
    exclude_same_file: bool = args.exclude_same_file
    min_cluster: int = clamp_int(args.min_cluster, 1, 10000, 'min_cluster')
    ref: str | None = args.ref
    json_out: bool = args.json_out

    if min_cluster < 2:
        # Report the value the user supplied; ``min_cluster`` may already have
        # been adjusted by clamp_int and would then misstate the input.
        logger.error("--min-cluster must be at least 2, got %d", args.min_cluster)
        raise SystemExit(ExitCode.USER_ERROR)

    root = require_repo()

    branch = read_current_branch(root)
    commit = resolve_commit_ref(root, branch, ref)
    if commit is None:
        logger.error("Commit %r not found.", ref or "HEAD")
        raise SystemExit(ExitCode.USER_ERROR)

    manifest = get_commit_snapshot_manifest(root, commit.commit_id) or {}

    # Index fast path: when analysing HEAD with --tier exact, restrict the
    # manifest to files that appear in the prebuilt hash_occurrence index.
    # This skips parsing files with no clone candidates.
    # The full manifest is kept when the index is absent or empty, and also
    # when the index yields no candidate files.
    if ref is None and tier == "exact":
        ho_index = load_hash_occurrence(root)
        if ho_index:
            candidate_files = _candidate_files_from_index(ho_index, min_cluster, file_filter)
            if candidate_files:
                manifest = {fp: oid for fp, oid in manifest.items() if fp in candidate_files}

    # Shared cache — each blob parsed at most once.
    cache = load_symbol_cache(root)

    cluster_list = find_clones(
        root,
        manifest,
        tier,
        kind_filter,
        min_cluster,
        language_filter=language_filter,
        file_filter=file_filter,
        exclude_same_file=exclude_same_file,
        cache=cache,
    )

    exact_clusters = [c for c in cluster_list if c.tier == "exact"]
    near_clusters = [c for c in cluster_list if c.tier == "near"]
    # Members are counted per cluster, so a symbol present in both an exact
    # and a near cluster contributes twice.
    total_symbols = sum(len(c.members) for c in cluster_list)
    hotspots = _file_hotspots(cluster_list)

    if json_out:
        print(json.dumps(_ClonesOutputJson(
            **make_envelope(elapsed),
            commit=commit.commit_id,
            branch=branch,
            tier=tier,
            min_cluster=min_cluster,
            kind_filter=kind_filter,
            language_filter=language_filter,
            file_filter=file_filter,
            exclude_same_file=exclude_same_file,
            exact_clone_clusters=len(exact_clusters),
            near_clone_clusters=len(near_clusters),
            total_symbols_involved=total_symbols,
            file_hotspots=hotspots,
            clusters=[c.to_dict() for c in cluster_list],
        )))
        return

    print(f"\nClone analysis — commit {commit.commit_id}")
    if kind_filter:
        print(f" (kind: {kind_filter})")
    if language_filter:
        print(f" (language: {language_filter})")
    if file_filter:
        print(f" (file prefix: {file_filter})")
    if exclude_same_file:
        print(" (same-file clusters excluded)")
    print("─" * 62)

    if not cluster_list:
        print("\n ✅ No clones detected.")
        return

    if exact_clusters and tier in ("exact", "both"):
        print(f"\nExact clones ({len(exact_clusters)} cluster(s)):")
        for cl in exact_clusters:
            print(f" body_hash {cl.hash_value}:")
            for addr, rec in cl.members:
                print(f" {sanitize_display(addr)} {rec['kind']}")

    if near_clusters and tier in ("near", "both"):
        print(f"\nNear-clones — same signature ({len(near_clusters)} cluster(s)):")
        for cl in near_clusters:
            print(f" signature_id {cl.hash_value}:")
            for addr, rec in cl.members:
                print(f" {sanitize_display(addr)} {rec['kind']} (body {rec['body_hash']})")

    print(f"\n {len(cluster_list)} clone cluster(s), {total_symbols} total symbol(s) involved")

    if hotspots:
        print("\nFiles with most clone members:")
        # The text report shows only the first five rows of the ranking.
        for h in hotspots[:5]:
            print(f" {sanitize_display(h['file'])} {h['clone_symbols']} clone symbol(s)")
File History
1 commit
sha256:d11a87833d5fad6059b7662844bf5448a8911a17cce7a51811f71ad394f248eb
bump to v0.2.0rc13
Human
patch
6 days ago