gabriel/musehub — blame/sha256:3/backfill_content_ids_from_snapshots.py

1 files

1 commits

0 hotspots

0 🧊 dead

0 💥 blast risk

sha256:0 fix: fall back to any indexed mpack in read_object_bytes when push mpac… · gabriel · Jun 17, 2026

1	#!/usr/bin/env python3
2	"""One-time backfill: fill missing content_id values on file-level symbol
3	history entries by reading the canonical source — the snapshot manifest.
4
5	Background
6	----------
7	When the indexer processes a file-level move/rename op (structured_delta op
8	type 'patch' with from_address), it records no new_content_id at the symbol
9	level. The snapshot stored at that commit, however, has the full
10	{path: content_id} manifest. This script reads those manifests and fills
11	the gaps.
12
13	Only file-level addresses (no '::' separator) are handled — they map directly
14	to a manifest path. Symbol-level addresses are not yet backfillable this way.
15
16	Usage
17	-----
18	# Dry-run: count rows that would be changed (no writes)
19	docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py --dry-run
20
21	# Run for all repos
22	docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py
23
24	# Run for a single repo
25	docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py --repo-id <repo_id>
26
27	# Quiet (no progress output)
28	docker exec musehub-blue python3 /app/deploy/backfill_content_ids_from_snapshots.py -q
29	"""
30	from __future__ import annotations
31
32	import argparse
33	import asyncio
34	import sys
35	import time
36
37	import sqlalchemy as sa
38	from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
39	from sqlalchemy.orm import sessionmaker
40
41	sys.path.insert(0, "/app")
42	sys.path.insert(0, "/tmp/devpkgs")
43
44	from musehub.db.database import get_database_url
45	from musehub.services.musehub_symbol_indexer import backfill_content_ids_from_snapshots
46
47
async def run(dry_run: bool, quiet: bool, repo_id: str | None) -> int:
    """Run the content_id backfill once and print a one-line summary.

    Args:
        dry_run: Forwarded to ``backfill_content_ids_from_snapshots``; when
            true the session is never committed.
        quiet: When true, the initial "Scanning …" line is suppressed. The
            final summary line is printed regardless.
        repo_id: Forwarded to the backfill to limit it to one repo; ``None``
            is reported as "all repos".

    Returns:
        The count returned by ``backfill_content_ids_from_snapshots``.
    """
    engine = create_async_engine(get_database_url(), echo=False)
    try:
        Session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

        async with Session() as session:
            if not quiet:
                scope = f"repo {repo_id}" if repo_id else "all repos"
                mode = "[DRY RUN] " if dry_run else ""
                print(f"{mode}Scanning {scope} for file-level entries with missing content_id …")

            t0 = time.monotonic()
            count = await backfill_content_ids_from_snapshots(
                session, repo_id=repo_id, dry_run=dry_run
            )

            # A dry run must leave the database untouched, so only commit
            # when real writes were requested.
            if not dry_run:
                await session.commit()

            # Measured after the commit so the reported time includes it.
            elapsed = time.monotonic() - t0

        verb = "Would update" if dry_run else "Updated"
        noun = "entry" if count == 1 else "entries"
        print(f"{verb} {count} {noun} ({elapsed:.1f}s)")
    finally:
        # Dispose the engine even when the backfill or commit raises, so
        # pooled connections are not left open on the error path.
        await engine.dispose()
    return count
73
74
def main() -> None:
    """Parse the command-line flags, run the backfill, and exit the process.

    The exit status is 0 when the count returned by ``run`` is non-negative
    and 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--dry-run", action="store_true", help="Count rows without writing")
    parser.add_argument("--repo-id", metavar="REPO_ID", help="Limit to a single repo")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress progress output")
    options = parser.parse_args()

    # asyncio.run drives the coroutine to completion on a fresh event loop.
    updated = asyncio.run(
        run(
            dry_run=options.dry_run,
            quiet=options.quiet,
            repo_id=options.repo_id,
        )
    )

    exit_status = 0 if updated >= 0 else 1
    sys.exit(exit_status)
84
85
# Script entry point: call main() only when this file is executed directly, not when it is imported.
86	if __name__ == "__main__":
87	main()

backfill_content_ids_from_snapshots.py file-level

`backfill_content_ids_from_snapshots.py` file-level