test_gc_object_refs.py
file-level
1
files
1
commits
0
hotspots
0
🧊 dead
0
💥 blast risk
| 1 | """TDD: GC object-ref pruning and global object cleanup. |
| 2 | |
| 3 | After a force push or branch rewrite, old commits and their snapshots become |
| 4 | orphaned. Objects referenced exclusively by orphaned snapshots should have |
| 5 | their ref rows removed. Objects with zero remaining refs across all repos |
| 6 | should be deleted from musehub_objects and from storage. |
| 7 | |
| 8 | Coverage matrix: |
| 9 | 1. GC deletes ref row for object unreachable from all live snapshots. |
| 10 | 2. GC does NOT delete ref when object is still referenced by a live snapshot. |
| 11 | 3. GC deletes musehub_objects row when no refs remain globally. |
| 12 | 4. GC does NOT delete musehub_objects row when another repo still holds a ref. |
| 13 | 5. GC on a clean repo (no orphaned commits) is a no-op — no refs disturbed. |
| 14 | 6. GCResult fields are populated correctly. |
| 15 | """ |
| 16 | from __future__ import annotations |
| 17 | |
| 18 | import secrets |
| 19 | from datetime import datetime, timezone |
| 20 | |
| 21 | import msgpack |
| 22 | import pytest |
| 23 | import sqlalchemy as sa |
| 24 | from sqlalchemy.ext.asyncio import AsyncSession |
| 25 | |
| 26 | from muse.core.types import blob_id |
| 27 | from musehub.db.musehub_repo_models import MusehubBranch, MusehubCommit, MusehubCommitRef, MusehubObject, MusehubObjectRef, MusehubSnapshot, MusehubSnapshotRef |
| 28 | from musehub.services.musehub_gc import run_gc |
| 29 | from musehub.types.json_types import StrDict |
| 30 | from tests.factories import create_repo |
| 31 | |
| 32 | |
def _now() -> datetime:
    """Return the current time as a timezone-aware UTC ``datetime``."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now
| 35 | |
| 36 | |
def _oid(seed: str) -> str:
    """Derive an object ID by passing the encoded *seed* to ``blob_id``."""
    seed_bytes = seed.encode()
    return blob_id(seed_bytes)
| 39 | |
| 40 | |
def _manifest(mapping: StrDict) -> bytes:
    """Serialise a ``{path: object_id}`` mapping to msgpack bytes."""
    packed = msgpack.packb(mapping, use_bin_type=True)
    return packed
| 44 | |
| 45 | |
| 46 | # --------------------------------------------------------------------------- |
| 47 | # Low-level DB helpers |
| 48 | # --------------------------------------------------------------------------- |
| 49 | |
async def _insert_object(session: AsyncSession, oid: str, repo_id: str) -> None:
    """Add a minimal ``MusehubObject`` row for *oid* unless one is already present.

    Only the object row is written here; ref rows are created separately via
    ``_insert_ref``. ``repo_id`` is accepted but not used by this helper.
    """
    lookup = sa.select(MusehubObject.object_id).where(
        MusehubObject.object_id == oid
    )
    found = (await session.execute(lookup)).scalar_one_or_none()
    if not found:
        session.add(
            MusehubObject(
                object_id=oid,
                size_bytes=10,
                path="test.md",
                content_cache=b"test content",
            )
        )
        await session.flush()
| 65 | |
| 66 | |
async def _insert_ref(session: AsyncSession, repo_id: str, oid: str) -> None:
    """Add a ``MusehubObjectRef`` row for (*repo_id*, *oid*) if none exists yet."""
    lookup = sa.select(MusehubObjectRef).where(
        MusehubObjectRef.repo_id == repo_id,
        MusehubObjectRef.object_id == oid,
    )
    current = (await session.execute(lookup)).scalar_one_or_none()
    if not current:
        session.add(MusehubObjectRef(repo_id=repo_id, object_id=oid))
        await session.flush()
| 79 | |
| 80 | |
async def _insert_snapshot(
    session: AsyncSession,
    snapshot_id: str,
    manifest: dict[str, str],
    repo_id: str = "",
) -> MusehubSnapshot:
    """Add a snapshot row whose manifest blob is the msgpack-encoded *manifest*.

    When *repo_id* is non-empty a ``MusehubSnapshotRef`` row linking the
    snapshot to that repo is added as well. The session is flushed before
    the new snapshot instance is returned.
    """
    encoded = _manifest(manifest)
    snapshot = MusehubSnapshot(
        snapshot_id=snapshot_id,
        manifest_blob=encoded,
        entry_count=len(manifest),
        directories=[],
    )
    session.add(snapshot)
    if repo_id:
        link = MusehubSnapshotRef(repo_id=repo_id, snapshot_id=snapshot_id)
        session.add(link)
    await session.flush()
    return snapshot
| 98 | |
| 99 | |
async def _insert_commit(
    session: AsyncSession,
    repo_id: str,
    commit_id: str,
    snapshot_id: str | None = None,
    parent_ids: list[str] | None = None,
    branch: str = "main",
) -> MusehubCommit:
    """Add a commit row plus the ``MusehubCommitRef`` row tying it to *repo_id*.

    A missing or empty *parent_ids* is stored as an empty list. The session
    is flushed before the new commit instance is returned.
    """
    parents = parent_ids or []
    new_commit = MusehubCommit(
        commit_id=commit_id,
        message="test commit",
        author="test-user",
        branch=branch,
        parent_ids=parents,
        snapshot_id=snapshot_id,
        timestamp=_now(),
    )
    session.add(new_commit)
    session.add(MusehubCommitRef(repo_id=repo_id, commit_id=commit_id))
    await session.flush()
    return new_commit
| 121 | |
| 122 | |
async def _insert_branch(
    session: AsyncSession,
    repo_id: str,
    head_commit_id: str,
    name: str = "main",
) -> MusehubBranch:
    """Add a branch row named *name* whose head is *head_commit_id*.

    The branch ID comes from ``compute_branch_id(repo_id, name)``. The
    session is flushed before the new branch instance is returned.
    """
    from musehub.core.genesis import compute_branch_id

    branch_id = compute_branch_id(repo_id, name)
    new_branch = MusehubBranch(
        branch_id=branch_id,
        repo_id=repo_id,
        name=name,
        head_commit_id=head_commit_id,
    )
    session.add(new_branch)
    await session.flush()
    return new_branch
| 139 | |
| 140 | |
async def _ref_exists(session: AsyncSession, repo_id: str, oid: str) -> bool:
    """Report whether a ``MusehubObjectRef`` row exists for (*repo_id*, *oid*)."""
    query = sa.select(MusehubObjectRef).where(
        MusehubObjectRef.repo_id == repo_id,
        MusehubObjectRef.object_id == oid,
    )
    outcome = await session.execute(query)
    return outcome.scalar_one_or_none() is not None
| 149 | |
| 150 | |
async def _object_exists(session: AsyncSession, oid: str) -> bool:
    """Report whether a ``MusehubObject`` row with ID *oid* exists."""
    query = sa.select(MusehubObject.object_id).where(
        MusehubObject.object_id == oid
    )
    outcome = await session.execute(query)
    return outcome.scalar_one_or_none() is not None
| 156 | |
| 157 | |
| 158 | # --------------------------------------------------------------------------- |
| 159 | # Test 1: GC removes ref for object only in orphaned snapshot |
| 160 | # --------------------------------------------------------------------------- |
| 161 | |
@pytest.mark.asyncio
async def test_gc_removes_stale_ref_for_orphaned_object(
    db_session: AsyncSession,
) -> None:
    """GC must drop the ref row of an object found only in an orphaned snapshot."""
    repo = await create_repo(db_session, slug="gc-stale-ref", owner="test-user-wire")
    repo_id = repo.repo_id
    stale_oid = _oid("stale-object-only-in-orphaned-snapshot")

    # Old chain root -> orphan_commit is left behind once the branch is
    # force-pushed to live_commit.
    orphan_snap = f"snap_{secrets.token_hex(4)}"
    root_commit = secrets.token_hex(16)
    orphan_commit = secrets.token_hex(16)
    live_commit = secrets.token_hex(16)
    live_snap = f"snap_{secrets.token_hex(4)}"

    await _insert_object(db_session, stale_oid, repo_id)
    await _insert_ref(db_session, repo_id, stale_oid)
    # The object appears only in the orphaned snapshot's manifest.
    await _insert_snapshot(
        db_session, orphan_snap, {"file.md": stale_oid}, repo_id=repo_id
    )
    # The live snapshot's manifest is empty, so nothing live references the object.
    await _insert_snapshot(db_session, live_snap, {}, repo_id=repo_id)
    await _insert_commit(db_session, repo_id, root_commit)
    await _insert_commit(
        db_session,
        repo_id,
        orphan_commit,
        snapshot_id=orphan_snap,
        parent_ids=[root_commit],
    )
    # The branch head points at this parentless commit (the force-push target).
    await _insert_commit(db_session, repo_id, live_commit, snapshot_id=live_snap)
    await _insert_branch(db_session, repo_id, live_commit)
    await db_session.commit()

    result = await run_gc(db_session, repo_id)

    assert result.object_refs_deleted >= 1, "stale ref must be deleted"
    assert not await _ref_exists(db_session, repo_id, stale_oid), (
        "ref row must be gone after GC"
    )
| 194 | |
| 195 | |
| 196 | # --------------------------------------------------------------------------- |
| 197 | # Test 2: GC keeps ref when object is still in a live snapshot |
| 198 | # --------------------------------------------------------------------------- |
| 199 | |
@pytest.mark.asyncio
async def test_gc_keeps_ref_for_live_object(
    db_session: AsyncSession,
) -> None:
    """GC must keep the ref of an object listed by both an orphaned and a live snapshot."""
    repo = await create_repo(db_session, slug="gc-live-ref", owner="test-user-wire")
    repo_id = repo.repo_id
    shared_oid = _oid("object-in-both-orphaned-and-live-snapshot")

    orphan_snap = f"snap_{secrets.token_hex(4)}"
    live_snap = f"snap_{secrets.token_hex(4)}"
    orphan_commit = secrets.token_hex(16)
    live_commit = secrets.token_hex(16)

    await _insert_object(db_session, shared_oid, repo_id)
    await _insert_ref(db_session, repo_id, shared_oid)
    # The same object is listed in the orphaned and the live manifest.
    await _insert_snapshot(
        db_session, orphan_snap, {"a.md": shared_oid}, repo_id=repo_id
    )
    await _insert_snapshot(
        db_session, live_snap, {"b.md": shared_oid}, repo_id=repo_id
    )
    await _insert_commit(db_session, repo_id, orphan_commit, snapshot_id=orphan_snap)
    await _insert_commit(db_session, repo_id, live_commit, snapshot_id=live_snap)
    await _insert_branch(db_session, repo_id, live_commit)
    await db_session.commit()

    result = await run_gc(db_session, repo_id)

    assert result.object_refs_deleted == 0, "ref to live object must not be deleted"
    assert await _ref_exists(db_session, repo_id, shared_oid), (
        "ref row must survive GC when object is live"
    )
| 228 | |
| 229 | |
| 230 | # --------------------------------------------------------------------------- |
| 231 | # Test 3: GC deletes musehub_objects row when globally orphaned |
| 232 | # --------------------------------------------------------------------------- |
| 233 | |
@pytest.mark.asyncio
async def test_gc_deletes_globally_orphaned_object(
    db_session: AsyncSession,
) -> None:
    """Once its last ref is pruned, the ``musehub_objects`` row must be removed too."""
    repo = await create_repo(db_session, slug="gc-global-orphan", owner="test-user-wire")
    repo_id = repo.repo_id
    orphan_oid = _oid("globally-orphaned-no-other-repo-refs")

    orphan_snap = f"snap_{secrets.token_hex(4)}"
    live_snap = f"snap_{secrets.token_hex(4)}"
    orphan_commit = secrets.token_hex(16)
    live_commit = secrets.token_hex(16)

    await _insert_object(db_session, orphan_oid, repo_id)
    await _insert_ref(db_session, repo_id, orphan_oid)
    await _insert_snapshot(
        db_session, orphan_snap, {"file.md": orphan_oid}, repo_id=repo_id
    )
    # The live snapshot's manifest is empty.
    await _insert_snapshot(db_session, live_snap, {}, repo_id=repo_id)
    await _insert_commit(db_session, repo_id, orphan_commit, snapshot_id=orphan_snap)
    await _insert_commit(db_session, repo_id, live_commit, snapshot_id=live_snap)
    await _insert_branch(db_session, repo_id, live_commit)
    await db_session.commit()

    result = await run_gc(db_session, repo_id)

    assert result.objects_deleted >= 1, "globally orphaned object must be deleted from DB"
    assert not await _object_exists(db_session, orphan_oid), (
        "musehub_objects row must be gone after GC"
    )
| 261 | |
| 262 | |
| 263 | # --------------------------------------------------------------------------- |
| 264 | # Test 4: GC does NOT delete musehub_objects when another repo still refs it |
| 265 | # --------------------------------------------------------------------------- |
| 266 | |
@pytest.mark.asyncio
async def test_gc_keeps_object_when_other_repo_holds_ref(
    db_session: AsyncSession,
) -> None:
    """An object shared with another repo must NOT be deleted from musehub_objects.

    repo_a lists the object only in an orphaned snapshot, so GC on repo_a is
    expected to prune repo_a's ref. repo_b still holds a ref, so the object
    row and repo_b's ref must both survive.
    """
    repo_a = await create_repo(db_session, slug="gc-shared-a", owner="test-user-wire")
    repo_b = await create_repo(db_session, slug="gc-shared-b", owner="test-user-wire")
    oid = _oid("shared-object-two-repos")

    # Set up repo_a with orphaned snapshot referencing the object
    snap_orphan_id = f"snap_{secrets.token_hex(4)}"
    snap_live_id = f"snap_{secrets.token_hex(4)}"
    c_orphan_id = secrets.token_hex(16)
    c_live_id = secrets.token_hex(16)

    await _insert_object(db_session, oid, repo_a.repo_id)
    # Both repos hold a ref
    await _insert_ref(db_session, repo_a.repo_id, oid)
    await _insert_ref(db_session, repo_b.repo_id, oid)

    await _insert_snapshot(db_session, snap_orphan_id, {"file.md": oid}, repo_id=repo_a.repo_id)
    await _insert_snapshot(db_session, snap_live_id, {}, repo_id=repo_a.repo_id)
    await _insert_commit(db_session, repo_a.repo_id, c_orphan_id, snapshot_id=snap_orphan_id)
    await _insert_commit(db_session, repo_a.repo_id, c_live_id, snapshot_id=snap_live_id)
    await _insert_branch(db_session, repo_a.repo_id, c_live_id)
    await db_session.commit()

    # GC repo_a — prunes repo_a's ref but repo_b's ref survives
    result = await run_gc(db_session, repo_a.repo_id)

    # Without these two checks a GC that never prunes refs would pass this
    # test, so the object's survival would not be attributable to repo_b.
    assert result.object_refs_deleted >= 1, \
        "repo_a stale ref must be deleted"
    assert not await _ref_exists(db_session, repo_a.repo_id, oid), \
        "repo_a ref must be pruned by repo_a GC"

    assert result.objects_deleted == 0, \
        "must not delete object still referenced by repo_b"
    assert await _object_exists(db_session, oid), \
        "musehub_objects row must survive because repo_b still holds a ref"
    assert await _ref_exists(db_session, repo_b.repo_id, oid), \
        "repo_b ref must be untouched by repo_a GC"
| 303 | |
| 304 | |
| 305 | # --------------------------------------------------------------------------- |
| 306 | # Test 5: Clean repo (no orphaned commits) — no refs disturbed |
| 307 | # --------------------------------------------------------------------------- |
| 308 | |
@pytest.mark.asyncio
async def test_gc_clean_repo_does_not_touch_refs(
    db_session: AsyncSession,
) -> None:
    """With every commit reachable, GC must delete nothing and leave refs intact."""
    repo = await create_repo(db_session, slug="gc-clean-repo", owner="test-user-wire")
    repo_id = repo.repo_id
    live_oid = _oid("clean-repo-live-object")

    snapshot_id = f"snap_{secrets.token_hex(4)}"
    commit_id = secrets.token_hex(16)

    await _insert_object(db_session, live_oid, repo_id)
    await _insert_ref(db_session, repo_id, live_oid)
    await _insert_snapshot(
        db_session, snapshot_id, {"readme.md": live_oid}, repo_id=repo_id
    )
    await _insert_commit(db_session, repo_id, commit_id, snapshot_id=snapshot_id)
    await _insert_branch(db_session, repo_id, commit_id)
    await db_session.commit()

    result = await run_gc(db_session, repo_id)

    # Every deletion counter must stay at zero.
    assert result.commits_deleted == 0
    assert result.snapshots_deleted == 0
    assert result.object_refs_deleted == 0
    assert result.objects_deleted == 0

    assert await _ref_exists(db_session, repo_id, live_oid), (
        "live object ref must survive a clean GC run"
    )
| 336 | |
| 337 | |
| 338 | # --------------------------------------------------------------------------- |
| 339 | # Test 6: GCResult fields are correctly populated |
| 340 | # --------------------------------------------------------------------------- |
| 341 | |
@pytest.mark.asyncio
async def test_gc_result_fields(
    db_session: AsyncSession,
) -> None:
    """Each GCResult counter must match exactly what the run removed."""
    repo = await create_repo(db_session, slug="gc-result-fields", owner="test-user-wire")
    repo_id = repo.repo_id
    target_oid = _oid("result-fields-object")

    orphan_snap = f"snap_{secrets.token_hex(4)}"
    live_snap = f"snap_{secrets.token_hex(4)}"
    orphan_commit = secrets.token_hex(16)
    live_commit = secrets.token_hex(16)

    await _insert_object(db_session, target_oid, repo_id)
    await _insert_ref(db_session, repo_id, target_oid)
    await _insert_snapshot(
        db_session, orphan_snap, {"file.md": target_oid}, repo_id=repo_id
    )
    await _insert_snapshot(db_session, live_snap, {}, repo_id=repo_id)
    await _insert_commit(db_session, repo_id, orphan_commit, snapshot_id=orphan_snap)
    await _insert_commit(db_session, repo_id, live_commit, snapshot_id=live_snap)
    await _insert_branch(db_session, repo_id, live_commit)
    await db_session.commit()

    result = await run_gc(db_session, repo_id)

    assert result.repo_id == repo_id
    assert result.commits_deleted == 1, "one orphaned commit"
    assert result.snapshots_deleted == 1, "one orphaned snapshot"
    assert result.object_refs_deleted == 1, "one stale ref"
    assert result.objects_deleted == 1, "one globally orphaned object"
    assert result.reachable_commit_count == 1, "one live commit"
    # The errors attribute must be a list even when nothing went wrong.
    assert isinstance(result.errors, list)