gabriel / musehub public

backfill_content_ids_from_snapshots.py file-level

at sha256:3 · View file ↗ · Intel ↗

History
1 files
1 commits
0 hotspots
0 🧊 dead
0 💥 blast risk
sha256:0 fix: fall back to any indexed mpack in read_object_bytes when push mpac… · gabriel · Jun 17, 2026
1 #!/usr/bin/env python3
2 """One-time backfill: fill missing content_id values on file-level symbol
3 history entries by reading the canonical source — the snapshot manifest.
4
5 Background
6 ----------
7 When the indexer processes a file-level move/rename op (structured_delta op
8 type 'patch' with from_address), it records no new_content_id at the symbol
9 level. The snapshot stored at that commit, however, has the full
10 {path: content_id} manifest. This script reads those manifests and fills
11 the gaps.
12
13 Only file-level addresses (no '::' separator) are handled — they map directly
14 to a manifest path. Symbol-level addresses are not yet backfillable this way.
15
16 Usage
17 -----
18 # Dry-run: count rows that would be changed (no writes)
19 docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py --dry-run
20
21 # Run for all repos
22 docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py
23
24 # Run for a single repo
25 docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py --repo-id <repo_id>
26
27 # Quiet (no progress output)
28 docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py -q
29 """
30 from __future__ import annotations
31
32 import argparse
33 import asyncio
34 import sys
35 import time
36
37 import sqlalchemy as sa
38 from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
39 from sqlalchemy.orm import sessionmaker
40
41 sys.path.insert(0, "/app")
42 sys.path.insert(0, "/tmp/devpkgs")
43
44 from musehub.db.database import get_database_url
45 from musehub.services.musehub_symbol_indexer import backfill_content_ids_from_snapshots
46
47
async def run(dry_run: bool, quiet: bool, repo_id: str | None) -> int:
    """Run the content_id backfill once and print a one-line summary.

    Creates an async engine for the URL returned by ``get_database_url()``,
    opens a session, and delegates the actual work to
    ``backfill_content_ids_from_snapshots``. The session is committed only
    when *dry_run* is false. The engine is disposed on every exit path,
    including when the backfill or the commit raises.

    Args:
        dry_run: When true, the session is not committed and the summary is
            phrased as "Would update ...".
        quiet: Suppress the initial "Scanning ..." progress line. The final
            summary line is printed regardless.
        repo_id: Limit the backfill to this repo; ``None`` means all repos.

    Returns:
        The count returned by ``backfill_content_ids_from_snapshots``.
    """
    engine = create_async_engine(get_database_url(), echo=False)
    try:
        Session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

        async with Session() as session:
            if not quiet:
                scope = f"repo {repo_id}" if repo_id else "all repos"
                mode = "[DRY RUN] " if dry_run else ""
                print(f"{mode}Scanning {scope} for file-level entries with missing content_id …")

            t0 = time.monotonic()
            count = await backfill_content_ids_from_snapshots(
                session, repo_id=repo_id, dry_run=dry_run
            )

            if not dry_run:
                await session.commit()

            # Elapsed time covers the backfill and, when applicable, the commit.
            elapsed = time.monotonic() - t0

            noun = "entry" if count == 1 else "entries"
            verb = "Would update" if dry_run else "Updated"
            print(f"{verb} {count} {noun} ({elapsed:.1f}s)")
    finally:
        # Always release pooled connections, even if the backfill failed.
        await engine.dispose()

    return count
73
74
def main() -> None:
    """Parse command-line flags, run the backfill, and exit with a status code.

    The module docstring doubles as the ``--help`` description, so it is
    rendered with the raw formatter to keep its layout intact.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--dry-run", action="store_true", help="Count rows without writing")
    parser.add_argument("--repo-id", metavar="REPO_ID", help="Limit to a single repo")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress progress output")
    options = parser.parse_args()

    updated = asyncio.run(
        run(
            dry_run=options.dry_run,
            quiet=options.quiet,
            repo_id=options.repo_id,
        )
    )

    # A negative count maps to exit status 1; anything else exits with 0.
    exit_status = 1 if updated < 0 else 0
    sys.exit(exit_status)


# Execute the CLI only when this file is run as a script.
if __name__ == "__main__":
    main()