#!/usr/bin/env python3 """One-time backfill: fill missing content_id values on file-level symbol history entries by reading the canonical source — the snapshot manifest. Background ---------- When the indexer processes a file-level move/rename op (structured_delta op type 'patch' with from_address), it records no new_content_id at the symbol level. The snapshot stored at that commit, however, has the full {path: content_id} manifest. This script reads those manifests and fills the gaps. Only file-level addresses (no '::' separator) are handled — they map directly to a manifest path. Symbol-level addresses are not yet backfillable this way. Usage ----- # Dry-run: count rows that would be changed (no writes) docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py --dry-run # Run for all repos docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py # Run for a single repo docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py --repo-id # Quiet (no progress output) docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py -q """ from __future__ import annotations import argparse import asyncio import sys import time import sqlalchemy as sa from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine from sqlalchemy.orm import sessionmaker sys.path.insert(0, "/app") sys.path.insert(0, "/tmp/devpkgs") from musehub.db.database import get_database_url from musehub.services.musehub_symbol_indexer import backfill_content_ids_from_snapshots async def run(dry_run: bool, quiet: bool, repo_id: str | None) -> int: engine = create_async_engine(get_database_url(), echo=False) Session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False) async with Session() as session: if not quiet: scope = f"repo {repo_id}" if repo_id else "all repos" mode = "[DRY RUN] " if dry_run else "" print(f"{mode}Scanning {scope} for file-level entries with missing content_id …") t0 = time.monotonic() count = await backfill_content_ids_from_snapshots(session, repo_id=repo_id, dry_run=dry_run) if not dry_run: await session.commit() elapsed = time.monotonic() - t0 if dry_run: print(f"Would update {count} entr{'y' if count == 1 else 'ies'} ({elapsed:.1f}s)") else: print(f"Updated {count} entr{'y' if count == 1 else 'ies'} ({elapsed:.1f}s)") await engine.dispose() return count def main() -> None: p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) p.add_argument("--dry-run", action="store_true", help="Count rows without writing") p.add_argument("--repo-id", metavar="REPO_ID", help="Limit to a single repo") p.add_argument("-q", "--quiet", action="store_true", help="Suppress progress output") args = p.parse_args() count = asyncio.run(run(dry_run=args.dry_run, quiet=args.quiet, repo_id=args.repo_id)) sys.exit(0 if count >= 0 else 1) if __name__ == "__main__": main()