backfill_content_ids_from_snapshots.py
file-level
1
files
1
commits
0
hotspots
0
🧊 dead
0
💥 blast risk
| 1 | #!/usr/bin/env python3 |
| 2 | """One-time backfill: fill missing content_id values on file-level symbol |
| 3 | history entries by reading the canonical source — the snapshot manifest. |
| 4 | |
| 5 | Background |
| 6 | ---------- |
| 7 | When the indexer processes a file-level move/rename op (structured_delta op |
| 8 | type 'patch' with from_address), it records no new_content_id at the symbol |
| 9 | level. The snapshot stored at that commit, however, has the full |
| 10 | {path: content_id} manifest. This script reads those manifests and fills |
| 11 | the gaps. |
| 12 | |
| 13 | Only file-level addresses (no '::' separator) are handled — they map directly |
| 14 | to a manifest path. Symbol-level addresses are not yet backfillable this way. |
| 15 | |
| 16 | Usage |
| 17 | ----- |
| 18 | # Dry-run: count rows that would be changed (no writes) |
| 19 | docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py --dry-run |
| 20 | |
| 21 | # Run for all repos |
| 22 | docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py |
| 23 | |
| 24 | # Run for a single repo |
| 25 | docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py --repo-id <repo_id> |
| 26 | |
| 27 | # Quiet (no progress output) |
| 28 | docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py -q |
| 29 | """ |
| 30 | from __future__ import annotations |
| 31 | |
| 32 | import argparse |
| 33 | import asyncio |
| 34 | import sys |
| 35 | import time |
| 36 | |
| 37 | import sqlalchemy as sa |
| 38 | from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine |
| 39 | from sqlalchemy.orm import sessionmaker |
| 40 | |
| 41 | sys.path.insert(0, "/app") |
| 42 | sys.path.insert(0, "/tmp/devpkgs") |
| 43 | |
| 44 | from musehub.db.database import get_database_url |
| 45 | from musehub.services.musehub_symbol_indexer import backfill_content_ids_from_snapshots |
| 46 | |
| 47 | |
async def run(dry_run: bool, quiet: bool, repo_id: str | None) -> int:
    """Run the content_id backfill once and print a one-line summary.

    Args:
        dry_run: When true, the session is never committed and the summary
            is phrased as "Would update ...".
        quiet: When true, the initial "Scanning ..." progress line is
            suppressed. The final summary line is printed regardless.
        repo_id: Repo to limit the backfill to, or ``None`` for all repos.

    Returns:
        The count returned by ``backfill_content_ids_from_snapshots``.
    """
    engine = create_async_engine(get_database_url(), echo=False)
    Session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    # Dispose the engine even when the backfill or the commit raises, so the
    # connection pool is not left open on the failure path.
    try:
        async with Session() as session:
            if not quiet:
                scope = f"repo {repo_id}" if repo_id else "all repos"
                mode = "[DRY RUN] " if dry_run else ""
                print(f"{mode}Scanning {scope} for file-level entries with missing content_id …")

            t0 = time.monotonic()
            count = await backfill_content_ids_from_snapshots(session, repo_id=repo_id, dry_run=dry_run)

            # A dry run must leave the database untouched, so only commit on a real run.
            if not dry_run:
                await session.commit()

            elapsed = time.monotonic() - t0

        if dry_run:
            print(f"Would update {count} entr{'y' if count == 1 else 'ies'} ({elapsed:.1f}s)")
        else:
            print(f"Updated {count} entr{'y' if count == 1 else 'ies'} ({elapsed:.1f}s)")
    finally:
        await engine.dispose()

    return count
| 73 | |
| 74 | |
def main() -> None:
    """Parse the command-line flags, run the backfill, and exit the process.

    The module docstring doubles as the ``--help`` description, so the raw
    formatter is used to keep its layout intact.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--dry-run", action="store_true", help="Count rows without writing")
    parser.add_argument("--repo-id", metavar="REPO_ID", help="Limit to a single repo")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress progress output")
    options = parser.parse_args()

    updated = asyncio.run(
        run(
            dry_run=options.dry_run,
            quiet=options.quiet,
            repo_id=options.repo_id,
        )
    )

    # Exit status 0 for any non-negative count, 1 otherwise.
    if updated >= 0:
        exit_status = 0
    else:
        exit_status = 1
    sys.exit(exit_status)
| 84 | |
| 85 | |
if __name__ == "__main__":
    # Only run the CLI when executed as a script, not when imported.
    main()