gabriel / musehub public
backfill_genesis_ops.py python
89 lines 3.1 KB
Raw
sha256:3ff9c9863a9891bdcde71b4a43228f66d0493e38b7cc1d09fe9eb7de774046b2 feat: add repair-commit wire endpoint (API parity with repa… Opus 4.8 minor ⚠ breaking 1 day ago
1 #!/usr/bin/env python3
2 """One-time backfill: set op='add' on the oldest history entry for every symbol
3 that was incorrectly recorded as 'modify' due to genesis commits lacking a
4 structured_delta.
5
6 Background
7 ----------
8 Prior to the structured_delta genesis fix in muse, the first commit in a repo
9 (no parent) never computed a plugin.diff(), leaving structured_delta=None.
10 The MuseHub indexer only saw a symbol for the first time in the *second* commit
11 that touched it — which recorded op='modify' (replace) instead of op='add'
12 (insert). This means the provenance timeline shows 'MODIFY' at the bottom
13 instead of 'ADD', and epoch 0 has no birth diff.
14
15 This script corrects that by finding the oldest history entry per symbol and
16 setting op='add' when it is not already 'add'.
17
18 Usage
19 -----
20 # Dry-run: count rows that would be changed (no writes)
21 docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py --dry-run
22
23 # Run for all repos
24 docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py
25
26 # Run for a single repo
27 docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py --repo-id <repo_id>
28
29 # Quiet (no progress output)
30 docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py -q
31 """
32 from __future__ import annotations
33
34 import argparse
35 import asyncio
36 import sys
37 import time
38
39 import sqlalchemy as sa
40 from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
41 from sqlalchemy.orm import sessionmaker
42
43 sys.path.insert(0, "/app")
44 sys.path.insert(0, "/tmp/devpkgs")
45
46 from musehub.db.database import get_database_url
47 from musehub.services.musehub_symbol_indexer import backfill_genesis_ops
48
49
async def run(dry_run: bool, quiet: bool, repo_id: str | None) -> int:
    """Run the genesis-op backfill once and print a one-line summary.

    Args:
        dry_run: Forwarded to ``backfill_genesis_ops``; when true the session
            is never committed by this function.
        quiet: When true, the initial "Scanning …" progress line is
            suppressed. The final summary line is always printed.
        repo_id: Forwarded to ``backfill_genesis_ops``; ``None`` is reported
            as "all repos" in the progress line.

    Returns:
        The count returned by ``backfill_genesis_ops``.

    Raises:
        Any exception raised by the backfill or the commit propagates to the
        caller; the engine is disposed first.
    """
    engine = create_async_engine(get_database_url(), echo=False)

    try:
        Session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

        async with Session() as session:
            if not quiet:
                scope = f"repo {repo_id}" if repo_id else "all repos"
                mode = "[DRY RUN] " if dry_run else ""
                print(f"{mode}Scanning {scope} for birth entries with op != 'add' …")

            t0 = time.monotonic()
            count = await backfill_genesis_ops(session, repo_id=repo_id, dry_run=dry_run)

            # Only persist when this is a real run; a dry run leaves the
            # session uncommitted.
            if not dry_run:
                await session.commit()

            elapsed = time.monotonic() - t0

            if dry_run:
                print(f"Would update {count} birth entr{'y' if count == 1 else 'ies'} ({elapsed:.1f}s)")
            else:
                print(f"Updated {count} birth entr{'y' if count == 1 else 'ies'} ({elapsed:.1f}s)")
    finally:
        # Dispose the engine on every exit path so a failed backfill or
        # commit does not leave the engine undisposed.
        await engine.dispose()

    return count
75
76
def main() -> None:
    """Parse command-line flags, run the backfill, and exit the process.

    Recognised flags are ``--dry-run``, ``--repo-id REPO_ID`` and
    ``-q``/``--quiet``. The process exits with status 0 when the count
    returned by ``run`` is non-negative and with status 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Count rows without writing",
    )
    parser.add_argument(
        "--repo-id",
        metavar="REPO_ID",
        help="Limit to a single repo",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="Suppress progress output",
    )
    options = parser.parse_args()

    updated = asyncio.run(
        run(
            dry_run=options.dry_run,
            quiet=options.quiet,
            repo_id=options.repo_id,
        )
    )

    exit_status = 0 if updated >= 0 else 1
    sys.exit(exit_status)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
File History 1 commit
sha256:3ff9c9863a9891bdcde71b4a43228f66d0493e38b7cc1d09fe9eb7de774046b2 feat: add repair-commit wire endpoint (API parity with repa… Opus 4.8 minor 1 day ago