#!/usr/bin/env python3 """One-time backfill: set op='add' on the oldest history entry for every symbol that was incorrectly recorded as 'modify' due to genesis commits lacking a structured_delta. Background ---------- Prior to the structured_delta genesis fix in muse, the first commit in a repo (no parent) never computed a plugin.diff(), leaving structured_delta=None. The MuseHub indexer only saw a symbol for the first time in the *second* commit that touched it — which recorded op='modify' (replace) instead of op='add' (insert). This means the provenance timeline shows 'MODIFY' at the bottom instead of 'ADD', and epoch 0 has no birth diff. This script corrects that by finding the oldest history entry per symbol and setting op='add' when it is not already 'add'. Usage ----- # Dry-run: count rows that would be changed (no writes) docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py --dry-run # Run for all repos docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py # Run for a single repo docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py --repo-id # Quiet (no progress output) docker exec musehub-blue python3 /app/deploy/backfill_genesis_ops.py -q """ from __future__ import annotations import argparse import asyncio import sys import time import sqlalchemy as sa from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine from sqlalchemy.orm import sessionmaker sys.path.insert(0, "/app") sys.path.insert(0, "/tmp/devpkgs") from musehub.db.database import get_database_url from musehub.services.musehub_symbol_indexer import backfill_genesis_ops async def run(dry_run: bool, quiet: bool, repo_id: str | None) -> int: engine = create_async_engine(get_database_url(), echo=False) Session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False) async with Session() as session: if not quiet: scope = f"repo {repo_id}" if repo_id else "all repos" mode = "[DRY RUN] " if dry_run else "" print(f"{mode}Scanning {scope} for birth entries with op != 'add' …") t0 = time.monotonic() count = await backfill_genesis_ops(session, repo_id=repo_id, dry_run=dry_run) if not dry_run: await session.commit() elapsed = time.monotonic() - t0 if dry_run: print(f"Would update {count} birth entr{'y' if count == 1 else 'ies'} ({elapsed:.1f}s)") else: print(f"Updated {count} birth entr{'y' if count == 1 else 'ies'} ({elapsed:.1f}s)") await engine.dispose() return count def main() -> None: p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) p.add_argument("--dry-run", action="store_true", help="Count rows without writing") p.add_argument("--repo-id", metavar="REPO_ID", help="Limit to a single repo") p.add_argument("-q", "--quiet", action="store_true", help="Suppress progress output") args = p.parse_args() count = asyncio.run(run(dry_run=args.dry_run, quiet=args.quiet, repo_id=args.repo_id)) sys.exit(0 if count >= 0 else 1) if __name__ == "__main__": main()