"""Count how many objects in R2 are still zlib-compressed.

Prints a progress line every 100 objects and a final summary.

Objects whose header could not be read are reported as ``errors`` instead of
being folded into the ``plain`` count, so the summary never claims an object
is uncompressed when it was not actually inspected.

Exit status:
    0 - every object was inspected and none is zlib-compressed
    1 - at least one zlib-compressed object was found
    2 - no compressed object was found, but some objects could not be read

Run:
    docker exec musehub-blue python3 /app/deploy/count_compressed.py
"""
import sys
from typing import Optional

import sqlalchemy as sa
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from musehub.config import settings
from musehub.db.musehub_repo_models import MusehubObject
from musehub.storage import get_backend

# Two-byte zlib stream headers: CMF byte 0x78 (deflate, 32 KiB window)
# followed by each of the four FLG bytes used for the compression levels
# when no preset dictionary is present (RFC 1950).
ZLIB_MAGIC = (b"\x78\x01", b"\x78\x9c", b"\x78\xda", b"\x78\x5e")


def _probe_header(backend, oid: str) -> Optional[bool]:
    """Inspect the first two bytes of an object.

    Args:
        backend: Storage backend; its ``_get_client()``, ``_key()`` and
            ``_bucket`` members are used to issue a ranged ``get_object``.
        oid: Object id to look up.

    Returns:
        ``True`` if the object starts with a zlib magic header, ``False`` if
        it does not, and ``None`` if the header could not be read (the error
        is printed).
    """
    client = backend._get_client()
    key = backend._key(oid)
    try:
        # Only the first two bytes are requested, so the check stays cheap
        # regardless of object size.
        resp = client.get_object(Bucket=backend._bucket, Key=key, Range="bytes=0-1")
        body = resp["Body"]
        try:
            header = body.read(2)
        finally:
            # Close the response stream explicitly rather than relying on
            # garbage collection to release it.
            body.close()
    except Exception as e:
        # Best-effort scan: report the failure and let the caller decide how
        # to count it, instead of aborting the whole run.
        print(f" ERROR {oid[:20]}: {e}", flush=True)
        return None
    return header in ZLIB_MAGIC


def check_header(backend, oid: str) -> bool:
    """Return True if the object starts with a zlib magic header.

    Read failures are printed and reported as ``False``; use
    ``_probe_header`` when a failed read must be told apart from an object
    that is genuinely not compressed.
    """
    return _probe_header(backend, oid) is True


def main() -> None:
    """Scan every live ``s3://`` object and report how many are compressed.

    Prints the total, a progress line every 100 objects (and on the last
    one), and a final summary, then exits with the status described in the
    module docstring.
    """
    # Sync engine — no asyncio, no threads, no surprises.
    sync_url = settings.database_url.replace("+asyncpg", "").replace("+aiosqlite", "")
    engine = create_engine(sync_url)
    backend = get_backend()

    with Session(engine) as session:
        rows = session.execute(
            sa.select(MusehubObject.object_id)
            .where(
                MusehubObject.storage_uri.like("s3://%"),
                MusehubObject.deleted_at.is_(None),
            )
            .order_by(MusehubObject.object_id)
        ).scalars().all()

    # The ids are fully materialised above, so the database session is
    # released before the (potentially long) storage scan begins.
    total = len(rows)
    print(f"Total objects: {total}", flush=True)

    compressed = 0
    plain = 0
    errors = 0

    for i, oid in enumerate(rows, 1):
        result = _probe_header(backend, oid)
        if result is None:
            # Unreadable objects are neither plain nor compressed.
            errors += 1
        elif result:
            compressed += 1
        else:
            plain += 1

        if i % 100 == 0 or i == total:
            print(
                f" [{i}/{total}] plain={plain} compressed={compressed} errors={errors}",
                flush=True,
            )

    print(f"\nDone. plain={plain} compressed={compressed} errors={errors}", flush=True)

    if compressed:
        sys.exit(1)
    # An incomplete scan must not look like a clean bill of health.
    sys.exit(2 if errors else 0)


if __name__ == "__main__":
    main()