backfill_genesis_ops.py
file-level
1
files
1
commits
0
hotspots
0
🧊 dead
0
💥 blast risk
| 1 | #!/usr/bin/env python3 |
| 2 | """One-time backfill: set op='add' on the oldest history entry for every symbol |
| 3 | that was incorrectly recorded as 'modify' due to genesis commits lacking a |
| 4 | structured_delta. |
| 5 | |
| 6 | Background |
| 7 | ---------- |
| 8 | Prior to the structured_delta genesis fix in muse, the first commit in a repo |
| 9 | (no parent) never computed a plugin.diff(), leaving structured_delta=None. |
| 10 | The MuseHub indexer only saw a symbol for the first time in the *second* commit |
| 11 | that touched it — which recorded op='modify' (replace) instead of op='add' |
| 12 | (insert). This means the provenance timeline shows 'MODIFY' at the bottom |
| 13 | instead of 'ADD', and epoch 0 has no birth diff. |
| 14 | |
| 15 | This script corrects that by finding the oldest history entry per symbol and |
| 16 | setting op='add' when it is not already 'add'. |
| 17 | |
| 18 | Usage |
| 19 | ----- |
| 20 | # Dry-run: count rows that would be changed (no writes) |
| 21 | docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py --dry-run |
| 22 | |
| 23 | # Run for all repos |
| 24 | docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py |
| 25 | |
| 26 | # Run for a single repo |
| 27 | docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py --repo-id <repo_id> |
| 28 | |
| 29 | # Quiet (no progress output) |
| 30 | docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py -q |
| 31 | """ |
| 32 | from __future__ import annotations |
| 33 | |
| 34 | import argparse |
| 35 | import asyncio |
| 36 | import sys |
| 37 | import time |
| 38 | |
| 39 | import sqlalchemy as sa |
| 40 | from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine |
| 41 | from sqlalchemy.orm import sessionmaker |
| 42 | |
| 43 | sys.path.insert(0, "/app") |
| 44 | sys.path.insert(0, "/tmp/devpkgs") |
| 45 | |
| 46 | from musehub.db.database import get_database_url |
| 47 | from musehub.services.musehub_symbol_indexer import backfill_genesis_ops |
| 48 | |
| 49 | |
async def run(dry_run: bool, quiet: bool, repo_id: str | None) -> int:
    """Run the genesis-op backfill once and print a one-line summary.

    Args:
        dry_run: When true, the session is never committed and the summary
            is phrased as "Would update ...".
        quiet: When true, the initial "Scanning ..." progress line is
            suppressed. The final summary line is printed regardless.
        repo_id: Passed through to ``backfill_genesis_ops``; ``None`` is
            reported to the user as "all repos".

    Returns:
        The count returned by ``backfill_genesis_ops``.
    """
    engine = create_async_engine(get_database_url(), echo=False)
    session_factory = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    # Dispose the engine even when the backfill or the commit raises, so a
    # failed run does not leave the connection pool open.
    try:
        async with session_factory() as session:
            if not quiet:
                scope = f"repo {repo_id}" if repo_id else "all repos"
                mode = "[DRY RUN] " if dry_run else ""
                print(f"{mode}Scanning {scope} for birth entries with op != 'add' …")

            t0 = time.monotonic()
            count = await backfill_genesis_ops(session, repo_id=repo_id, dry_run=dry_run)

            if not dry_run:
                await session.commit()

            # The elapsed time deliberately includes the commit.
            elapsed = time.monotonic() - t0

            verb = "Would update" if dry_run else "Updated"
            suffix = "y" if count == 1 else "ies"
            print(f"{verb} {count} birth entr{suffix} ({elapsed:.1f}s)")
    finally:
        await engine.dispose()

    return count
| 75 | |
| 76 | |
def main() -> None:
    """Command-line entry point.

    Parses ``--dry-run``, ``--repo-id`` and ``-q/--quiet``, drives ``run``
    to completion on a fresh event loop, then exits the process: status 0
    when the returned count is non-negative, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Registered in this order so the generated --help text is unchanged.
    flag_specs = (
        (("--dry-run",), {"action": "store_true", "help": "Count rows without writing"}),
        (("--repo-id",), {"metavar": "REPO_ID", "help": "Limit to a single repo"}),
        (("-q", "--quiet"), {"action": "store_true", "help": "Suppress progress output"}),
    )
    for names, options in flag_specs:
        parser.add_argument(*names, **options)

    opts = parser.parse_args()

    updated = asyncio.run(
        run(dry_run=opts.dry_run, quiet=opts.quiet, repo_id=opts.repo_id)
    )
    exit_status = 1 if updated < 0 else 0
    sys.exit(exit_status)


# Only run the backfill when executed as a script, not when imported.
if __name__ == "__main__":
    main()