gabriel / muse public
range_diff.py python
634 lines 21.5 KB
Raw
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
1 """``muse range-diff <base>..<old> <base>..<new>`` — compare two versions of a commit series.
2
3 Shows which commits changed, which are new, and which were dropped between
4 two versions of a patch series — typically before and after a rebase.
5
6 Pairing algorithm
7 -----------------
8 1. Compute a patch-id (SHA-256 of diff content lines) for every commit in both
9 series using the same algorithm as ``muse patch-id``.
10 2. Exact patch-id matches are always paired as ``equivalent`` — the commits
11 make identical logical changes regardless of commit ID or timestamp.
12 3. Remaining (unmatched) commits are paired positionally in series order when
13 ``--creation-factor > 0.0``. The creation factor controls aggressiveness:
14 at ``1.0`` all remaining are paired; at ``0.0`` only exact matches are paired
15 and all remaining are reported as ``dropped`` or ``added``.
16 4. Leftover old commits (no new partner) → ``dropped``.
17 Leftover new commits (no old partner) → ``added``.
18
19 Output (text, default)::
20
21 = <old_short> <subject> (equivalent)
22 ! <old_short> → <new_short> (changed)
23 < <old_short> <subject> (dropped from new)
24 > <new_short> <subject> (added in new)
25
26 JSON (``--json``)::
27
28 {
29 "old_range": "base..old",
30 "new_range": "base..new",
31 "trivially_equivalent": true,
32 "old_count": 3,
33 "new_count": 3,
34 "stable": false,
35 "creation_factor": 0.6,
36 "pairs": [
37 {
38 "old": {
39 "commit_id": "sha256:...",
40 "patch_id": "sha256:...",
41 "subject": "feat: add foo",
42 "files_changed": 2
43 },
44 "new": {
45 "commit_id": "sha256:...",
46 "patch_id": "sha256:...",
47 "subject": "feat: add foo",
48 "files_changed": 2
49 },
50 "status": "equivalent"
51 }
52 ],
53 "duration_ms": 12.3,
54 "exit_code": 0
55 }
56
57 ``old`` or ``new`` is ``null`` for ``dropped`` and ``added`` entries respectively.
58
59 Flags
60 -----
61 ``--creation-factor N``
62 Float 0.0–1.0. At ``0.6`` (default) remaining (non-exact-match) commits are
63 paired positionally as ``changed``. At ``0.0`` no fuzzy pairing is performed —
64 unpaired commits are always ``dropped`` or ``added``.
65
66 ``--stable``
67 Strip trailing whitespace from diff lines before computing patch-ids
68 (cosmetic whitespace changes are ignored).
69
70 ``--json``
71 Emit a single JSON object on stdout.
72
73 Exit codes::
74
75 0 — series are trivially equivalent (all patch-ids match)
76 1 — at least one commit differs, was dropped, or was added; or usage error
77 2 — not a Muse repository
78 """
79
80 import argparse
81 import hashlib
82 import json as _json
83 import logging
84 import pathlib
85 import re
86 import sys
87 from concurrent.futures import ThreadPoolExecutor, as_completed
88 from typing import TypedDict
89
90 from muse.core.envelope import EnvelopeJson, make_envelope
91 from muse.core.errors import ExitCode
92 from muse.core.graph import ancestor_ids, iter_ancestors
93 from muse.core.object_store import read_object
94 from muse.core.repo import require_repo
95 from muse.core.refs import read_ref
96 from muse.core.refs import (
97 get_head_commit_id,
98 read_current_branch,
99 )
100 from muse.core.commits import read_commit
101 from muse.core.snapshots import read_snapshot
102 from muse.core.types import Manifest, blob_id, long_id, short_id
103 from muse.core.paths import ref_path as _ref_path
104 from muse.core.validation import sanitize_display
105 from muse.core.timing import start_timer
106
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
108
109 # ---------------------------------------------------------------------------
110 # Wire-format TypedDicts
111 # ---------------------------------------------------------------------------
112
class _CommitInfoDict(TypedDict):
    """One commit as it appears on the ``old`` or ``new`` side of a pair.

    Instances are produced by ``_commit_info``.
    """

    # Commit identifier exactly as handed to ``_commit_info``.
    commit_id: str
    # Patch-id computed for the commit's diff against its first parent.
    patch_id: str
    # First line of the commit message; empty string when there is none.
    subject: str
    # Count of added + removed + modified paths in the commit's diff.
    files_changed: int
118
class _PairDict(TypedDict, total=False):
    """One row of the range-diff result.

    Declared with ``total=False`` so the internal ordering keys may be absent.
    """

    # Old-series commit; ``None`` when ``status`` is "added".
    old: _CommitInfoDict | None
    # New-series commit; ``None`` when ``status`` is "dropped".
    new: _CommitInfoDict | None
    # One of "equivalent", "changed", "dropped" or "added".
    status: str
    # Internal ordering keys; never present in the pairs handed back to callers.
    _old_idx: int
    _new_idx: int
125
class _RangeDiffResultDict(TypedDict):
    """Command result assembled by ``run`` before it is rendered as text or JSON."""

    # Range arguments exactly as given on the command line.
    old_range: str
    new_range: str
    # True when every pair has status "equivalent" (also true for no pairs).
    trivially_equivalent: bool
    # Number of commits walked in each series.
    old_count: int
    new_count: int
    # Whether trailing whitespace was stripped before hashing diff lines.
    stable: bool
    # Creation factor after clamping to the range 0.0-1.0.
    creation_factor: float
    # Paired, dropped and added entries.
    pairs: list[_PairDict]
135
class _RangeDiffJson(EnvelopeJson):
    """Stable JSON envelope for ``muse range-diff --json`` output."""

    # Same payload fields as ``_RangeDiffResultDict``; ``run`` merges them with
    # the keys returned by ``make_envelope`` when ``--json`` is given.
    old_range: str
    new_range: str
    trivially_equivalent: bool
    old_count: int
    new_count: int
    stable: bool
    creation_factor: float
    pairs: list[_PairDict]
146
# Safe ref characters: alphanumeric, underscore, slash, dot, hyphen, colon.
# The colon is required for ``sha256:``-prefixed commit IDs.
# NOTE: "." is in the class, so a string containing ".." also matches; callers
# that care about the range separator must check for it separately.
_SAFE_REF_RE = re.compile(r"^[a-zA-Z0-9_/.\-:]+$")
# Range pattern: allows '..' as separator in addition to safe ref chars.
# NOTE(review): the second alternative is the same expression as _SAFE_REF_RE
# and already matches everything the first alternative matches, so this
# pattern adds no extra restriction. It is not referenced in the visible part
# of this file - confirm whether it is still needed.
_SAFE_RANGE_RE = re.compile(r"^[a-zA-Z0-9_/.\-:]+(\.\.)[a-zA-Z0-9_/.\-:]+$|^[a-zA-Z0-9_/.\-:]+$")
152
153 # ---------------------------------------------------------------------------
154 # Range parsing
155 # ---------------------------------------------------------------------------
156
157 def _parse_range(ref: str) -> tuple[str | None, str]:
158 """Parse ``"base..tip"`` into ``(base, tip)``.
159
160 Returns ``(None, ref)`` for plain refs without ``".."``.
161 Leading/trailing whitespace around both parts is stripped.
162 """
163 if ".." in ref:
164 parts = ref.split("..", 1)
165 return parts[0].strip(), parts[1].strip()
166 return None, ref.strip()
167
168 # ---------------------------------------------------------------------------
169 # Ref resolution
170 # ---------------------------------------------------------------------------
171
def _resolve_ref(root: pathlib.Path, treeish: str) -> str | None:
    """Turn ``HEAD``, a full commit ID, or a ref name into a commit ID.

    Resolution order:

    1. ``HEAD`` (any letter case) - the head commit of the current branch;
       ``None`` if looking it up raises.
    2. A full commit ID, either ``sha256:<64 hex>`` or bare 64 lowercase hex
       characters - returned in its ``sha256:``-prefixed form when
       ``read_commit`` finds it, otherwise ``None``.
    3. Anything else is treated as a ref name and read through ``read_ref``.
    """
    if treeish.upper() == "HEAD":
        try:
            return get_head_commit_id(root, read_current_branch(root))
        except Exception:
            # Best effort: an unreadable HEAD is reported as "not found".
            return None

    if re.fullmatch(r"sha256:[0-9a-f]{64}", treeish):
        candidate = treeish
    elif re.fullmatch(r"[0-9a-f]{64}", treeish):
        candidate = long_id(treeish)
    else:
        # Not shaped like a commit ID: look it up as a named ref.
        return read_ref(_ref_path(root, treeish))

    # A well-formed ID only counts when the commit is actually readable.
    return candidate if read_commit(root, candidate) is not None else None
199
200 # ---------------------------------------------------------------------------
201 # Range walking
202 # ---------------------------------------------------------------------------
203
204 def _exclude_set(root: pathlib.Path, start_id: str | None) -> set[str]:
205 """Return all commit IDs reachable from *start_id* (for range exclusion)."""
206 if start_id is None:
207 return set()
208 return ancestor_ids(root, start_id)
209
210 def _walk_range(root: pathlib.Path, base_id: str | None, tip_id: str) -> list[str]:
211 """Return commit IDs in ``base..tip``, oldest-first.
212
213 Commits reachable from *base_id* are excluded. When *base_id* equals
214 *tip_id* the result is empty (empty range).
215 """
216 if base_id is not None and base_id == tip_id:
217 return []
218
219 exclude = _exclude_set(root, base_id)
220 result = [
221 c.commit_id
222 for c in iter_ancestors(root, tip_id, first_parent_only=True, exclude=exclude)
223 ]
224 result.reverse() # oldest-first
225 return result
226
227 # ---------------------------------------------------------------------------
228 # Patch-id computation
229 # ---------------------------------------------------------------------------
230
def _compute_patch_id(
    root: pathlib.Path,
    base_manifest: Manifest,
    target_manifest: Manifest,
    *,
    stable: bool = False,
) -> tuple[str, int]:
    """Hash the textual diff between two manifests into a patch-id.

    For every path that was added, removed, or points at a different object,
    the two file versions are diffed with ``difflib.unified_diff`` and every
    line starting with ``+`` or ``-`` is fed to SHA-256.  That includes the
    ``---``/``+++`` file header lines, so the path names are part of the hash.

    Args:
        root: Absolute repo root.
        base_manifest: Parent commit manifest (path → object_id).
        target_manifest: This commit manifest (path → object_id).
        stable: When True, strip trailing whitespace from each hashed line.

    Returns:
        ``(patch_id, files_changed)``: the digest wrapped by ``long_id`` and
        the number of changed paths.
    """
    import difflib

    def _lines_of(manifest: Manifest, path: str) -> list[str]:
        # A path missing from the manifest, or an empty/unreadable object,
        # is treated as an empty file.
        if path not in manifest:
            return []
        blob = read_object(root, manifest[path])
        return blob.decode("utf-8", errors="replace").splitlines() if blob else []

    base_paths = set(base_manifest)
    target_paths = set(target_manifest)
    modified = {
        path
        for path in base_paths & target_paths
        if base_manifest[path] != target_manifest[path]
    }
    # Sorted so the hash does not depend on manifest iteration order.
    changed_paths = sorted((base_paths ^ target_paths) | modified)

    digest = hashlib.sha256()
    for path in changed_paths:
        before = _lines_of(base_manifest, path)
        after = _lines_of(target_manifest, path)
        diff = difflib.unified_diff(
            before,
            after,
            fromfile=f"a/{path}",
            tofile=f"b/{path}",
            lineterm="",
        )
        for entry in diff:
            if not entry.startswith(("+", "-")):
                continue  # context lines and hunk headers are not hashed
            text = entry.rstrip() if stable else entry
            digest.update(text.encode("utf-8", errors="replace") + b"\n")

    return long_id(digest.hexdigest()), len(changed_paths)
287
def _patch_id_for_commit(
    root: pathlib.Path,
    commit_id: str,
    *,
    stable: bool,
) -> tuple[str, int]:
    """Compute ``(patch_id, files_changed)`` for one commit against its parent.

    The commit's snapshot manifest is diffed against the manifest of
    ``parent_commit_id``.  A missing parent, or any snapshot that cannot be
    read, contributes an empty manifest.  When the commit itself cannot be
    read, the result is ``blob_id`` of the encoded commit ID with a file
    count of 0.

    Args:
        root: Absolute repo root.
        commit_id: ``sha256:``-prefixed commit ID.
        stable: Strip trailing whitespace before hashing when True.

    Returns:
        Tuple of (patch_id, files_changed) as produced by ``_compute_patch_id``.
    """
    commit = read_commit(root, commit_id)
    if commit is None:
        return blob_id(commit_id.encode()), 0

    def _manifest_of(snapshot_id):
        # Copy the manifest so later code never aliases the snapshot's mapping.
        snapshot = read_snapshot(root, snapshot_id)
        return dict(snapshot.manifest) if snapshot else {}

    parent = read_commit(root, commit.parent_commit_id) if commit.parent_commit_id else None
    parent_manifest: Manifest = _manifest_of(parent.snapshot_id) if parent else {}
    own_manifest: Manifest = _manifest_of(commit.snapshot_id)

    return _compute_patch_id(root, parent_manifest, own_manifest, stable=stable)
322
def _commit_info(
    root: pathlib.Path,
    commit_id: str,
    patch_id: str,
    files_changed: int,
) -> _CommitInfoDict:
    """Assemble the JSON-ready description of one commit.

    Args:
        root: Absolute repo root.
        commit_id: ``sha256:``-prefixed commit ID.
        patch_id: ``sha256:``-prefixed patch-id.
        files_changed: Number of files added/removed/modified in this commit.

    Returns:
        Dict with ``commit_id``, ``patch_id``, ``subject`` and
        ``files_changed``.  ``subject`` is the first line of the commit
        message, or an empty string when the commit cannot be read or has
        no message.
    """
    commit = read_commit(root, commit_id)
    message = commit.message if commit else None
    first_line = message.splitlines()[0] if message else ""
    info: _CommitInfoDict = {
        "commit_id": commit_id,
        "patch_id": patch_id,
        "subject": first_line,
        "files_changed": files_changed,
    }
    return info
350
351 # ---------------------------------------------------------------------------
352 # Pairing
353 # ---------------------------------------------------------------------------
354
def _pair_series(
    root: pathlib.Path,
    old_ids: list[str],
    new_ids: list[str],
    old_pids: dict[str, str],
    new_pids: dict[str, str],
    old_fcs: dict[str, int],
    new_fcs: dict[str, int],
    creation_factor: float,
) -> list[_PairDict]:
    """Pair old and new commit series into a list of pair dicts.

    Pairing runs in three steps:

    1. Exact patch-id matches become ``"equivalent"`` pairs.  Several commits
       may share one patch-id (for example empty commits, or the same change
       applied twice); they are matched one-to-one in series order rather
       than collapsing onto a single partner.
    2. When ``creation_factor > 0.0`` the still-unmatched commits are paired
       positionally as ``"changed"``.
    3. Whatever is left is reported as ``"dropped"`` (old only) or ``"added"``
       (new only).

    Args:
        root: Absolute repo root, forwarded to ``_commit_info``.
        old_ids: Old series commit IDs, oldest first.
        new_ids: New series commit IDs, oldest first.
        old_pids: Patch-id for every ID in *old_ids*.
        new_pids: Patch-id for every ID in *new_ids*.
        old_fcs: Files-changed count per old commit (missing → 0).
        new_fcs: Files-changed count per new commit (missing → 0).
        creation_factor: Positional pairing is enabled when this is > 0.0.

    Returns:
        Pair dicts with keys ``old``, ``new`` and ``status``, ordered by
        position in the new series; dropped commits come last in old-series
        order.
    """
    from collections import defaultdict, deque

    def _first_positions(ids: list[str]) -> dict[str, int]:
        # One O(n) pass instead of a list.index() scan per lookup.
        positions: dict[str, int] = {}
        for idx, cid in enumerate(ids):
            positions.setdefault(cid, idx)
        return positions

    old_pos = _first_positions(old_ids)
    new_pos = _first_positions(new_ids)

    def _old_info(cid: str) -> _CommitInfoDict:
        return _commit_info(root, cid, old_pids[cid], old_fcs.get(cid, 0))

    def _new_info(cid: str) -> _CommitInfoDict:
        return _commit_info(root, cid, new_pids[cid], new_fcs.get(cid, 0))

    # patch_id → new commits carrying it, in series order.  A queue (not a
    # single value) keeps duplicates available for later old commits.
    new_by_pid: dict[str, deque[str]] = defaultdict(deque)
    for cid in new_ids:
        new_by_pid[new_pids[cid]].append(cid)

    used_old: set[str] = set()
    used_new: set[str] = set()
    pairs: list[_PairDict] = []

    # Pass 1: exact patch-id matches.
    for cid in old_ids:
        candidates = new_by_pid.get(old_pids[cid])
        if not candidates:
            continue
        new_cid = candidates.popleft()
        pairs.append({
            "old": _old_info(cid),
            "new": _new_info(new_cid),
            "status": "equivalent",
            "_old_idx": old_pos[cid],
            "_new_idx": new_pos[new_cid],
        })
        used_old.add(cid)
        used_new.add(new_cid)

    # Pass 2: positional pairing for unmatched commits (if creation_factor > 0).
    if creation_factor > 0.0:
        remaining_old = [c for c in old_ids if c not in used_old]
        remaining_new = [c for c in new_ids if c not in used_new]
        # zip() stops at the shorter list, leaving the surplus for pass 3.
        for old_cid, new_cid in zip(remaining_old, remaining_new):
            pairs.append({
                "old": _old_info(old_cid),
                "new": _new_info(new_cid),
                "status": "changed",
                "_old_idx": old_pos[old_cid],
                "_new_idx": new_pos[new_cid],
            })
            used_old.add(old_cid)
            used_new.add(new_cid)

    # Pass 3a: old commits with no partner.  They sort after every new commit.
    for cid in old_ids:
        if cid not in used_old:
            pairs.append({
                "old": _old_info(cid),
                "new": None,
                "status": "dropped",
                "_old_idx": old_pos[cid],
                "_new_idx": len(new_ids),
            })

    # Pass 3b: new commits with no partner.
    for cid in new_ids:
        if cid not in used_new:
            pairs.append({
                "old": None,
                "new": _new_info(cid),
                "status": "added",
                "_old_idx": len(old_ids),
                "_new_idx": new_pos[cid],
            })

    # Sort by new series order (then old order for dropped).
    pairs.sort(key=lambda p: (p["_new_idx"], p["_old_idx"]))

    # The sort keys are internal only; remove them before returning.
    for p in pairs:
        del p["_old_idx"]
        del p["_new_idx"]

    return pairs
446
447 # ---------------------------------------------------------------------------
448 # Registration
449 # ---------------------------------------------------------------------------
450
def register(
    subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]",
) -> None:
    """Attach the ``range-diff`` subcommand and its options to *subparsers*.

    The subcommand takes two positional range arguments plus
    ``--creation-factor``, ``--stable`` and ``--json``/``-j``, and dispatches
    to ``run`` through the ``func`` default.  The module docstring is used
    verbatim as the help description.
    """
    cmd = subparsers.add_parser(
        "range-diff",
        help="Compare two versions of a commit series.",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # The two positional ranges differ only in name and help text.
    positionals = (
        ("old_range", "OLD_RANGE", "Old commit range (e.g. base..old-branch)."),
        ("new_range", "NEW_RANGE", "New commit range (e.g. base..new-branch)."),
    )
    for dest, metavar, help_text in positionals:
        cmd.add_argument(dest, metavar=metavar, help=help_text)

    cmd.add_argument(
        "--creation-factor",
        dest="creation_factor",
        metavar="N",
        type=float,
        default=0.6,
        help=(
            "Float 0.0–1.0. How aggressively to pair unmatched commits positionally. "
            "1.0 = pair all remaining; 0.0 = exact patch-id matches only. (default: 0.6)"
        ),
    )
    cmd.add_argument(
        "--stable",
        action="store_true",
        help="Ignore trailing whitespace when computing patch-ids.",
    )
    cmd.add_argument(
        "--json", "-j",
        dest="json_out",
        action="store_true",
        help="Emit a single JSON object on stdout.",
    )
    cmd.set_defaults(func=run)
494
495 # ---------------------------------------------------------------------------
496 # Run
497 # ---------------------------------------------------------------------------
498
def run(args: argparse.Namespace) -> None:
    """Compare two commit series and report differences.

    Validates both range arguments, resolves their endpoints, walks each
    series, computes patch-ids on a thread pool, pairs the two series and
    prints either a text summary or a single JSON object.

    Args:
        args: Parsed namespace carrying ``old_range``, ``new_range``,
            ``creation_factor``, ``stable`` and ``json_out``.

    Raises:
        SystemExit: With ``ExitCode.USER_ERROR`` when a range argument is
            malformed, when a ref cannot be resolved, or when the series are
            not trivially equivalent.  The function returns normally (exit
            status 0) when every pair is equivalent.  Whatever
            ``require_repo`` does outside a repository is not handled here.
    """
    elapsed = start_timer()
    old_range_str: str = args.old_range
    new_range_str: str = args.new_range
    # Out-of-range factors are clamped rather than rejected.
    creation_factor: float = max(0.0, min(1.0, args.creation_factor))
    stable: bool = args.stable
    json_out: bool = args.json_out

    # Validate — reject ANSI/control characters.
    for raw in (old_range_str, new_range_str):
        if any(ord(c) < 32 for c in raw):
            print(f"❌ Invalid ref: {sanitize_display(raw)}", file=sys.stderr)
            raise SystemExit(ExitCode.USER_ERROR)
        # Validate exactly the parts that will be resolved later.  The range
        # is split on its first ".." only, so a tip such as "b..c" or "/../x"
        # must be rejected here: "." and "/" are both legal ref characters,
        # and a leftover ".." would otherwise reach the ref-path lookup.
        for part in _parse_range(raw):
            if part and (".." in part or not _SAFE_REF_RE.match(part)):
                print(f"❌ Invalid ref: {sanitize_display(raw)}", file=sys.stderr)
                raise SystemExit(ExitCode.USER_ERROR)

    root = require_repo()

    # Parse ranges.
    old_base_str, old_tip_str = _parse_range(old_range_str)
    new_base_str, new_tip_str = _parse_range(new_range_str)

    # Resolve refs.  Every failure is reported before exiting so the user
    # sees all unresolved refs at once.
    def _resolve(ref: str, label: str) -> str | None:
        resolved = _resolve_ref(root, ref)
        if resolved is None:
            print(f"❌ Cannot resolve ref: {sanitize_display(ref)} ({label})", file=sys.stderr)
        return resolved

    # An absent or empty base means "no exclusion": the walk is unbounded.
    old_base_id = _resolve(old_base_str, "old base") if old_base_str else None
    old_tip_id = _resolve(old_tip_str, "old tip")
    new_base_id = _resolve(new_base_str, "new base") if new_base_str else None
    new_tip_id = _resolve(new_tip_str, "new tip")

    if old_tip_id is None or new_tip_id is None:
        raise SystemExit(ExitCode.USER_ERROR)
    if old_base_str and old_base_id is None:
        raise SystemExit(ExitCode.USER_ERROR)
    if new_base_str and new_base_id is None:
        raise SystemExit(ExitCode.USER_ERROR)

    # Collect commit series.
    old_ids = _walk_range(root, old_base_id, old_tip_id)
    new_ids = _walk_range(root, new_base_id, new_tip_id)

    # Compute patch-ids (and files_changed counts) in parallel.
    old_pids: dict[str, str] = {}
    new_pids: dict[str, str] = {}
    old_fcs: dict[str, int] = {}
    new_fcs: dict[str, int] = {}

    all_ids = [(cid, "old") for cid in old_ids] + [(cid, "new") for cid in new_ids]

    def _compute(item: tuple[str, str]) -> tuple[str, str, str, int]:
        cid, side = item
        pid, fc = _patch_id_for_commit(root, cid, stable=stable)
        return cid, side, pid, fc

    # max_workers must be at least 1 even when both series are empty.
    with ThreadPoolExecutor(max_workers=min(8, max(1, len(all_ids)))) as pool:
        for cid, side, pid, fc in pool.map(_compute, all_ids):
            if side == "old":
                old_pids[cid] = pid
                old_fcs[cid] = fc
            else:
                new_pids[cid] = pid
                new_fcs[cid] = fc

    # Pair the series.
    pairs = _pair_series(
        root, old_ids, new_ids,
        old_pids, new_pids,
        old_fcs, new_fcs,
        creation_factor,
    )

    # Two empty series have no pairs and therefore count as equivalent.
    trivially_equivalent = all(p["status"] == "equivalent" for p in pairs)
    exit_code = 0 if trivially_equivalent else int(ExitCode.USER_ERROR)

    result: _RangeDiffResultDict = {
        "old_range": old_range_str,
        "new_range": new_range_str,
        "trivially_equivalent": trivially_equivalent,
        "old_count": len(old_ids),
        "new_count": len(new_ids),
        "stable": stable,
        "creation_factor": creation_factor,
        "pairs": pairs,
    }

    if json_out:
        print(_json.dumps(_RangeDiffJson(
            **make_envelope(elapsed, exit_code=exit_code),
            **result,
        )))
    else:
        _print_text(result)

    if not trivially_equivalent:
        raise SystemExit(ExitCode.USER_ERROR)
610
611 def _print_text(result: _RangeDiffResultDict) -> None:
612 """Print a human-readable range-diff summary."""
613 print(f"# range-diff {sanitize_display(result['old_range'])} → {sanitize_display(result['new_range'])}")
614 print()
615
616 if not result["pairs"]:
617 print("(empty — both series are empty)")
618 return
619
620 for p in result["pairs"]:
621 status = p["status"]
622 if status == "equivalent":
623 old = p["old"]
624 print(f"= {short_id(old['commit_id'])} {sanitize_display(old['subject'])}")
625 elif status == "changed":
626 old = p["old"]
627 new = p["new"]
628 print(f"! {short_id(old['commit_id'])} → {short_id(new['commit_id'])} {sanitize_display(new['subject'])}")
629 elif status == "dropped":
630 old = p["old"]
631 print(f"< {short_id(old['commit_id'])} {sanitize_display(old['subject'])}")
632 elif status == "added":
633 new = p["new"]
634 print(f"> {short_id(new['commit_id'])} {sanitize_display(new['subject'])}")
File History 4 commits
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 23 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 29 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 29 days ago