test_gc.py
file-level
1
files
1
commits
0
hotspots
0
π§ dead
0
π₯ blast risk
| 1 | """Section 32 — Garbage Collector: 7-layer test suite. |
| 2 | |
| 3 | Covers: |
| 4 | musehub/services/musehub_gc.py — GCResult, run_gc |
| 5 | musehub/api/routes/wire.py — _run_gc_async (fire-and-forget background task) |
| 6 | |
| 7 | Key behaviour: |
| 8 | - run_gc collects all branch head commit IDs, then runs a BFS through parent_ids |
| 9 | - Commits reachable from any branch head are preserved |
| 10 | - Orphaned commits (not reachable from any branch) are deleted |
| 11 | - Snapshots referenced only by orphaned commits are deleted |
| 12 | - Snapshots also referenced by a reachable commit are preserved |
| 13 | - Repos with no branch heads → GC skips (returns empty result) |
| 14 | - Repos already clean → GC is a no-op (returns 0 deletes) |
| 15 | - run_gc commits the session itself |
| 16 | """ |
| 17 | from __future__ import annotations |
| 18 | |
| 19 | import secrets |
| 20 | from datetime import datetime, timezone |
| 21 | |
| 22 | import pytest |
| 23 | from sqlalchemy import select |
| 24 | from sqlalchemy.ext.asyncio import AsyncSession |
| 25 | |
| 26 | from musehub.core.genesis import compute_identity_id, compute_repo_id |
| 27 | from musehub.db.musehub_repo_models import ( |
| 28 | MusehubBranch, |
| 29 | MusehubCommit, |
| 30 | MusehubCommitRef, |
| 31 | MusehubRepo, |
| 32 | MusehubSnapshot, |
| 33 | MusehubSnapshotRef, |
| 34 | ) |
| 35 | from musehub.services.musehub_gc import GCResult, run_gc |
| 36 | |
| 37 | |
| 38 | # ── helpers ────────────────────────────────────────────────────────────────── |
| 39 | |
| 40 | |
def _uid() -> str:
    """Return a fresh random identifier of 32 lowercase hex characters."""
    # 16 random bytes rendered as hex.
    raw = secrets.token_bytes(16)
    return raw.hex()
| 43 | |
| 44 | |
def _cid() -> str:
    """Return a fresh random commit/snapshot id of 16 lowercase hex characters."""
    # 8 random bytes rendered as hex.
    raw = secrets.token_bytes(8)
    return raw.hex()
| 47 | |
| 48 | |
def _now() -> datetime:
    """Return the current moment as a timezone-aware datetime in UTC."""
    utc = timezone.utc
    return datetime.now(utc)
| 51 | |
| 52 | |
async def _db_repo(session: AsyncSession) -> MusehubRepo:
    """Insert a private repo owned by ``testuser`` and return it.

    The slug is ``gc-repo-`` plus eight random hex characters. The row is
    added to *session* and flushed, but not committed.
    """
    repo_slug = "gc-repo-" + _uid()[:8]
    stamp = _now()
    owner = compute_identity_id(b"testuser")
    # The repo id is computed from the owner id, the slug, the literal "code"
    # and the ISO-formatted creation time.
    new_repo_id = compute_repo_id(owner, repo_slug, "code", stamp.isoformat())

    row = MusehubRepo(
        repo_id=new_repo_id,
        name=repo_slug,
        slug=repo_slug,
        owner="testuser",
        owner_user_id=owner,
        visibility="private",
        created_at=stamp,
        updated_at=stamp,
    )
    session.add(row)
    await session.flush()
    return row
| 70 | |
| 71 | |
async def _db_branch(
    session: AsyncSession,
    repo_id: str,
    *,
    name: str = "dev",
    head_commit_id: str | None = None,
) -> MusehubBranch:
    """Insert a branch row for *repo_id* and return it.

    ``head_commit_id`` may be left as ``None`` to create a branch without a
    head. The row is added to *session* and flushed, but not committed.
    """
    fields = {
        "branch_id": _uid(),
        "repo_id": repo_id,
        "name": name,
        "head_commit_id": head_commit_id,
    }
    row = MusehubBranch(**fields)
    session.add(row)
    await session.flush()
    return row
| 88 | |
| 89 | |
async def _db_commit(
    session: AsyncSession,
    repo_id: str,
    *,
    commit_id: str | None = None,
    parent_ids: list[str] | None = None,
    snapshot_id: str | None = None,
    branch: str = "dev",
) -> MusehubCommit:
    """Insert a commit plus the ref row tying it to *repo_id*; return the commit.

    A falsy ``commit_id`` is replaced by a random one and a falsy
    ``parent_ids`` by an empty list. Both rows are added to *session* and
    flushed, but not committed.
    """
    if not commit_id:
        commit_id = _cid()
    parents = parent_ids if parent_ids else []

    row = MusehubCommit(
        commit_id=commit_id,
        branch=branch,
        parent_ids=parents,
        message="test commit",
        author="testuser",
        timestamp=_now(),
        snapshot_id=snapshot_id,
    )
    session.add(row)

    # Link the commit to its repo via a separate ref row.
    link = MusehubCommitRef(repo_id=repo_id, commit_id=row.commit_id)
    session.add(link)

    await session.flush()
    return row
| 112 | |
| 113 | |
async def _db_snapshot(
    session: AsyncSession,
    repo_id: str,
    *,
    snapshot_id: str | None = None,
) -> MusehubSnapshot:
    """Insert a snapshot plus the ref row tying it to *repo_id*; return the snapshot.

    A falsy ``snapshot_id`` is replaced by a random one; the manifest blob is
    empty. Both rows are added to *session* and flushed, but not committed.
    """
    if not snapshot_id:
        snapshot_id = _cid()

    row = MusehubSnapshot(
        snapshot_id=snapshot_id,
        manifest_blob=b"",
        created_at=_now(),
    )
    session.add(row)

    # Link the snapshot to its repo via a separate ref row.
    link = MusehubSnapshotRef(repo_id=repo_id, snapshot_id=row.snapshot_id)
    session.add(link)

    await session.flush()
    return row
| 129 | |
| 130 | |
| 131 | # ────────────────────────────────────────────────────────────────────────────── |
| 132 | # Layer 1 — Unit |
| 133 | # ────────────────────────────────────────────────────────────────────────────── |
| 134 | |
| 135 | |
class TestUnitGC:
    """Layer 1 — in-memory checks of ``GCResult`` and commit-graph reachability.

    No database is involved. The reachability tests walk a plain
    ``commit_id -> parent_ids`` mapping; they do not call ``run_gc``.
    """

    @staticmethod
    def _reachable_from(heads: list[str], graph: dict[str, list[str]]) -> set[str]:
        """Return every commit id in *graph* reachable from *heads*.

        Performs a breadth-first walk over parent links. Ids that are not keys
        of *graph* (dangling parents) are skipped, and the visited set doubles
        as the guard against cycles.
        """
        from collections import deque

        seen: set[str] = set()
        frontier = deque(heads)
        while frontier:
            # popleft() gives FIFO order, i.e. a genuine breadth-first walk.
            cid = frontier.popleft()
            if cid in seen or cid not in graph:
                continue
            seen.add(cid)
            frontier.extend(graph[cid])
        return seen

    def test_gcresult_defaults(self) -> None:
        """A result built from only a repo id has zero counts and no errors."""
        r = GCResult(repo_id="repo-abc")
        assert r.commits_deleted == 0
        assert r.snapshots_deleted == 0
        assert r.reachable_commit_count == 0
        assert r.errors == []

    def test_gcresult_is_dataclass(self) -> None:
        """``GCResult`` is declared as a dataclass."""
        import dataclasses

        assert dataclasses.is_dataclass(GCResult)

    def test_gcresult_with_values(self) -> None:
        """Explicit counter values are stored unchanged."""
        r = GCResult(
            repo_id="abc",
            commits_deleted=5,
            snapshots_deleted=3,
            reachable_commit_count=10,
        )
        assert r.commits_deleted == 5
        assert r.snapshots_deleted == 3
        assert r.reachable_commit_count == 10

    def test_gcresult_errors_is_list(self) -> None:
        """``errors`` is a mutable list that accepts appended messages."""
        r = GCResult(repo_id="x")
        r.errors.append("something failed")
        assert len(r.errors) == 1

    def test_bfs_reachability_logic(self) -> None:
        """Commits outside the head's ancestry are not marked reachable.

        This is a stand-in for the traversal the module docstring attributes
        to ``run_gc``; ``run_gc`` itself is not exercised here.
        """
        # Commit graph (arrows point at parents):
        #   head -> c2 -> c1 -> root
        #                        ^
        #                      orphan   (no branch points at it)
        all_commits = {
            "head": ["c2"],
            "c2": ["c1"],
            "c1": ["root"],
            "root": [],
            "orphan": ["root"],  # orphan points to root but no branch points to it
        }

        reachable = self._reachable_from(["head"], all_commits)

        assert "head" in reachable
        assert "c2" in reachable
        assert "c1" in reachable
        assert "root" in reachable
        assert "orphan" not in reachable

    def test_bfs_handles_merge_commits(self) -> None:
        """Merge commits have two parents — the walk must follow both."""
        all_commits = {
            "merge": ["left", "right"],
            "left": ["base"],
            "right": ["base"],
            "base": [],
        }

        reachable = self._reachable_from(["merge"], all_commits)

        assert reachable == {"merge", "left", "right", "base"}

    def test_bfs_handles_cycle_guard(self) -> None:
        """Circular parent references must terminate (already-visited guard)."""
        all_commits = {"a": ["b"], "b": ["a"]}

        reachable = self._reachable_from(["a"], all_commits)

        assert reachable == {"a", "b"}
| 230 | |
| 231 | |
| 232 | # ────────────────────────────────────────────────────────────────────────────── |
| 233 | # Layer 2 — Integration |
| 234 | # ────────────────────────────────────────────────────────────────────────────── |
| 235 | |
| 236 | |
class TestIntegrationGC:
    """Layer 2 — ``run_gc`` against a database session, one behaviour per test.

    Each test builds its own repo with the ``_db_*`` helpers, commits the
    session, runs ``run_gc`` and checks the returned counters and/or the rows
    that remain.
    """

    async def test_gc_clean_repo_no_deletes(self, db_session: AsyncSession) -> None:
        """A repo whose only commit is the branch head has nothing to collect."""
        repo = await _db_repo(db_session)
        c1 = await _db_commit(db_session, repo.repo_id)
        await _db_branch(db_session, repo.repo_id, head_commit_id=c1.commit_id)
        await db_session.commit()

        result = await run_gc(db_session, repo.repo_id)
        assert result.commits_deleted == 0
        assert result.snapshots_deleted == 0
        assert result.reachable_commit_count == 1

    async def test_gc_no_branch_heads_skips(self, db_session: AsyncSession) -> None:
        """With no branch head to start from, GC deletes nothing."""
        repo = await _db_repo(db_session)
        # Branch with no head_commit_id
        await _db_branch(db_session, repo.repo_id, head_commit_id=None)
        stray_cid = _cid()
        await _db_commit(db_session, repo.repo_id, commit_id=stray_cid)
        await db_session.commit()

        result = await run_gc(db_session, repo.repo_id)
        # No heads -> GC skips immediately; nothing deleted
        assert result.commits_deleted == 0
        assert result.reachable_commit_count == 0

        # The counters alone do not prove the row survived; check it directly.
        assert await db_session.get(MusehubCommit, stray_cid) is not None

    async def test_gc_deletes_orphaned_commit(self, db_session: AsyncSession) -> None:
        """A commit no branch can reach is removed; the head is kept."""
        repo = await _db_repo(db_session)
        live = await _db_commit(db_session, repo.repo_id)
        orphan = await _db_commit(db_session, repo.repo_id)
        await _db_branch(db_session, repo.repo_id, head_commit_id=live.commit_id)
        await db_session.commit()

        result = await run_gc(db_session, repo.repo_id)
        assert result.commits_deleted == 1
        assert result.reachable_commit_count == 1

        # Verify orphan is gone
        row = await db_session.get(MusehubCommit, orphan.commit_id)
        assert row is None

    async def test_gc_preserves_reachable_chain(self, db_session: AsyncSession) -> None:
        """Every ancestor of the branch head (root <- mid <- head) is kept."""
        repo = await _db_repo(db_session)
        root_cid = _cid()
        mid_cid = _cid()
        head_cid = _cid()
        await _db_commit(db_session, repo.repo_id, commit_id=root_cid)
        await _db_commit(
            db_session, repo.repo_id, commit_id=mid_cid, parent_ids=[root_cid]
        )
        await _db_commit(
            db_session, repo.repo_id, commit_id=head_cid, parent_ids=[mid_cid]
        )
        await _db_branch(db_session, repo.repo_id, head_commit_id=head_cid)
        await db_session.commit()

        result = await run_gc(db_session, repo.repo_id)
        assert result.commits_deleted == 0
        assert result.reachable_commit_count == 3

        for cid in [root_cid, mid_cid, head_cid]:
            row = await db_session.get(MusehubCommit, cid)
            assert row is not None

    async def test_gc_deletes_orphaned_snapshot(self, db_session: AsyncSession) -> None:
        """A snapshot used only by an orphaned commit is removed with it."""
        repo = await _db_repo(db_session)
        snap = await _db_snapshot(db_session, repo.repo_id)
        live = await _db_commit(db_session, repo.repo_id)
        # The orphaned commit is the snapshot's only user.
        await _db_commit(db_session, repo.repo_id, snapshot_id=snap.snapshot_id)
        await _db_branch(db_session, repo.repo_id, head_commit_id=live.commit_id)
        await db_session.commit()

        result = await run_gc(db_session, repo.repo_id)
        assert result.commits_deleted == 1
        assert result.snapshots_deleted == 1

        snap_row = await db_session.get(MusehubSnapshot, snap.snapshot_id)
        assert snap_row is None

    async def test_gc_preserves_snapshot_referenced_by_live_commit(
        self, db_session: AsyncSession
    ) -> None:
        """A snapshot shared by a live and an orphaned commit is kept."""
        repo = await _db_repo(db_session)
        shared_snap = await _db_snapshot(db_session, repo.repo_id)
        # Both reachable and orphan point to same snapshot
        live_cid = _cid()
        orphan_cid = _cid()
        await _db_commit(
            db_session, repo.repo_id,
            commit_id=live_cid, snapshot_id=shared_snap.snapshot_id
        )
        await _db_commit(
            db_session, repo.repo_id,
            commit_id=orphan_cid, snapshot_id=shared_snap.snapshot_id
        )
        await _db_branch(db_session, repo.repo_id, head_commit_id=live_cid)
        await db_session.commit()

        result = await run_gc(db_session, repo.repo_id)
        assert result.commits_deleted == 1  # orphan commit removed
        assert result.snapshots_deleted == 0  # snapshot still used by live commit

        snap_row = await db_session.get(MusehubSnapshot, shared_snap.snapshot_id)
        assert snap_row is not None

    async def test_gc_multiple_branches_union_of_reachable(
        self, db_session: AsyncSession
    ) -> None:
        """Commits reachable from any branch head count as reachable."""
        repo = await _db_repo(db_session)
        c1 = await _db_commit(db_session, repo.repo_id)
        c2 = await _db_commit(db_session, repo.repo_id)
        await _db_branch(db_session, repo.repo_id, name="dev", head_commit_id=c1.commit_id)
        await _db_branch(db_session, repo.repo_id, name="main", head_commit_id=c2.commit_id)
        await db_session.commit()

        result = await run_gc(db_session, repo.repo_id)
        assert result.commits_deleted == 0
        assert result.reachable_commit_count == 2

    async def test_gc_returns_gcresult(self, db_session: AsyncSession) -> None:
        """``run_gc`` returns a ``GCResult`` tagged with the repo it ran on."""
        repo = await _db_repo(db_session)
        c = await _db_commit(db_session, repo.repo_id)
        await _db_branch(db_session, repo.repo_id, head_commit_id=c.commit_id)
        await db_session.commit()

        result = await run_gc(db_session, repo.repo_id)
        assert isinstance(result, GCResult)
        assert result.repo_id == repo.repo_id

    async def test_gc_only_affects_target_repo(self, db_session: AsyncSession) -> None:
        """Running GC on one repo leaves another repo's commits alone."""
        repo1 = await _db_repo(db_session)
        repo2 = await _db_repo(db_session)
        live = await _db_commit(db_session, repo1.repo_id)
        r2_orphan = await _db_commit(db_session, repo2.repo_id)
        await _db_branch(db_session, repo1.repo_id, head_commit_id=live.commit_id)
        await _db_branch(db_session, repo2.repo_id, head_commit_id=None)
        await db_session.commit()

        # Run GC on repo1 only
        result = await run_gc(db_session, repo1.repo_id)
        assert result.commits_deleted == 0

        # repo2's commit must still exist
        row = await db_session.get(MusehubCommit, r2_orphan.commit_id)
        assert row is not None
| 382 | |
| 383 | |
| 384 | # ────────────────────────────────────────────────────────────────────────────── |
| 385 | # Layer 3 — End-to-End |
| 386 | # ────────────────────────────────────────────────────────────────────────────── |
| 387 | |
| 388 | |
class TestE2EGC:
    """Layer 3 — whole-repo scenarios driven through ``run_gc`` directly.

    GC has no direct HTTP endpoint; ``_run_gc_async`` is fire-and-forget after
    push. These tests therefore set up realistic repo state, call ``run_gc``
    and verify the resulting database state.
    """

    async def test_e2e_gc_linear_history(self, db_session: AsyncSession) -> None:
        """5-commit linear chain plus 2 orphans: chain survives, orphans go."""
        repo = await _db_repo(db_session)
        every_id = [_cid() for _ in range(7)]
        chain, strays = every_id[:5], every_id[5:]

        # Each chain commit names its predecessor as parent; the first has none.
        for parent, cid in zip([None, *chain], chain):
            await _db_commit(
                db_session,
                repo.repo_id,
                commit_id=cid,
                parent_ids=[] if parent is None else [parent],
            )
        for cid in strays:
            await _db_commit(db_session, repo.repo_id, commit_id=cid)

        await _db_branch(db_session, repo.repo_id, head_commit_id=chain[-1])
        await db_session.commit()

        outcome = await run_gc(db_session, repo.repo_id)

        assert outcome.reachable_commit_count == 5
        assert outcome.commits_deleted == 2

        for cid in chain:
            kept = await db_session.get(MusehubCommit, cid)
            assert kept is not None
        for cid in strays:
            gone = await db_session.get(MusehubCommit, cid)
            assert gone is None

    async def test_e2e_gc_empty_repo(self, db_session: AsyncSession) -> None:
        """A repo with no branches, commits or snapshots reports zero deletes."""
        repo = await _db_repo(db_session)
        await db_session.commit()

        outcome = await run_gc(db_session, repo.repo_id)
        assert outcome.commits_deleted == 0
        assert outcome.snapshots_deleted == 0

    async def test_e2e_gc_snapshot_lifecycle(self, db_session: AsyncSession) -> None:
        """The live commit's snapshot is kept; the orphan's snapshot is removed."""
        repo = await _db_repo(db_session)
        kept_snap = await _db_snapshot(db_session, repo.repo_id)
        doomed_snap = await _db_snapshot(db_session, repo.repo_id)

        kept_cid = _cid()
        doomed_cid = _cid()
        await _db_commit(
            db_session,
            repo.repo_id,
            commit_id=kept_cid,
            snapshot_id=kept_snap.snapshot_id,
        )
        await _db_commit(
            db_session,
            repo.repo_id,
            commit_id=doomed_cid,
            snapshot_id=doomed_snap.snapshot_id,
        )
        await _db_branch(db_session, repo.repo_id, head_commit_id=kept_cid)
        await db_session.commit()

        outcome = await run_gc(db_session, repo.repo_id)
        assert outcome.commits_deleted == 1
        assert outcome.snapshots_deleted == 1

        surviving = await db_session.get(MusehubSnapshot, kept_snap.snapshot_id)
        assert surviving is not None
        removed = await db_session.get(MusehubSnapshot, doomed_snap.snapshot_id)
        assert removed is None
| 451 | |
| 452 | |
| 453 | # ────────────────────────────────────────────────────────────────────────────── |
| 454 | # Layer 4 — Stress |
| 455 | # ────────────────────────────────────────────────────────────────────────────── |
| 456 | |
| 457 | |
class TestStressGC:
    """Layer 4 — larger inputs: many orphans, a long chain, repeated runs."""

    async def test_gc_large_orphan_set(self, db_session: AsyncSession) -> None:
        """All 200 orphaned commits are collected while the head is kept."""
        repo = await _db_repo(db_session)
        head = await _db_commit(db_session, repo.repo_id)
        await _db_branch(db_session, repo.repo_id, head_commit_id=head.commit_id)

        orphan_total = 200
        for _ in range(orphan_total):
            await _db_commit(db_session, repo.repo_id)

        await db_session.commit()

        outcome = await run_gc(db_session, repo.repo_id)
        assert outcome.commits_deleted == 200
        assert outcome.reachable_commit_count == 1

    async def test_gc_deep_chain(self, db_session: AsyncSession) -> None:
        """A 100-commit linear chain is traversed end to end; nothing is deleted."""
        repo = await _db_repo(db_session)
        chain = [_cid() for _ in range(100)]
        # Pair each commit with its predecessor; the first commit has no parent.
        for parent, cid in zip([None, *chain], chain):
            await _db_commit(
                db_session,
                repo.repo_id,
                commit_id=cid,
                parent_ids=[] if parent is None else [parent],
            )
        await _db_branch(db_session, repo.repo_id, head_commit_id=chain[-1])
        await db_session.commit()

        outcome = await run_gc(db_session, repo.repo_id)
        assert outcome.commits_deleted == 0
        assert outcome.reachable_commit_count == 100

    async def test_gc_many_orphaned_snapshots(self, db_session: AsyncSession) -> None:
        """50 orphaned commits, each with its own snapshot, are all collected."""
        repo = await _db_repo(db_session)
        head = await _db_commit(db_session, repo.repo_id)
        await _db_branch(db_session, repo.repo_id, head_commit_id=head.commit_id)

        for _ in range(50):
            doomed = await _db_snapshot(db_session, repo.repo_id)
            await _db_commit(
                db_session, repo.repo_id, snapshot_id=doomed.snapshot_id
            )

        await db_session.commit()

        outcome = await run_gc(db_session, repo.repo_id)
        assert outcome.commits_deleted == 50
        assert outcome.snapshots_deleted == 50

    async def test_gc_idempotent_on_clean_repo(self, db_session: AsyncSession) -> None:
        """Two consecutive runs on an already-clean repo both delete nothing."""
        repo = await _db_repo(db_session)
        head = await _db_commit(db_session, repo.repo_id)
        await _db_branch(db_session, repo.repo_id, head_commit_id=head.commit_id)
        await db_session.commit()

        first = await run_gc(db_session, repo.repo_id)
        second = await run_gc(db_session, repo.repo_id)

        assert first.commits_deleted == 0
        assert second.commits_deleted == 0
| 517 | |
| 518 | |
| 519 | # ────────────────────────────────────────────────────────────────────────────── |
| 520 | # Layer 5 — Data Integrity |
| 521 | # ────────────────────────────────────────────────────────────────────────────── |
| 522 | |
| 523 | |
class TestDataIntegrityGC:
    """Layer 5 — GC must never remove live data, and its counters must be true."""

    async def test_gc_does_not_delete_head_commit(self, db_session: AsyncSession) -> None:
        """The commit a branch points at is still present after GC."""
        repo = await _db_repo(db_session)
        tip = await _db_commit(db_session, repo.repo_id)
        await _db_branch(db_session, repo.repo_id, head_commit_id=tip.commit_id)
        await db_session.commit()

        await run_gc(db_session, repo.repo_id)

        found = await db_session.get(MusehubCommit, tip.commit_id)
        assert found is not None

    async def test_gc_does_not_delete_live_snapshot(self, db_session: AsyncSession) -> None:
        """The snapshot of a reachable commit is still present after GC."""
        repo = await _db_repo(db_session)
        live_snap = await _db_snapshot(db_session, repo.repo_id)
        tip = await _db_commit(
            db_session, repo.repo_id, snapshot_id=live_snap.snapshot_id
        )
        await _db_branch(db_session, repo.repo_id, head_commit_id=tip.commit_id)
        await db_session.commit()

        await run_gc(db_session, repo.repo_id)

        found = await db_session.get(MusehubSnapshot, live_snap.snapshot_id)
        assert found is not None

    async def test_gc_counts_match_actual_deletes(self, db_session: AsyncSession) -> None:
        """Reported delete counts agree with the rows left for the repo."""
        repo = await _db_repo(db_session)
        tip = await _db_commit(db_session, repo.repo_id)

        # Three snapshots first, then one orphaned commit per snapshot.
        doomed_snaps = []
        for _ in range(3):
            doomed_snaps.append(await _db_snapshot(db_session, repo.repo_id))
        for doomed in doomed_snaps:
            await _db_commit(
                db_session, repo.repo_id, snapshot_id=doomed.snapshot_id
            )

        await _db_branch(db_session, repo.repo_id, head_commit_id=tip.commit_id)
        await db_session.commit()

        outcome = await run_gc(db_session, repo.repo_id)
        assert outcome.commits_deleted == 3
        assert outcome.snapshots_deleted == 3

        # Cross-check against what is actually left for this repo.
        commit_stmt = (
            select(MusehubCommit)
            .join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)
            .where(MusehubCommitRef.repo_id == repo.repo_id)
        )
        commit_rows = (await db_session.execute(commit_stmt)).scalars().all()
        assert len(commit_rows) == 1

        snap_stmt = (
            select(MusehubSnapshot)
            .join(
                MusehubSnapshotRef,
                MusehubSnapshotRef.snapshot_id == MusehubSnapshot.snapshot_id,
            )
            .where(MusehubSnapshotRef.repo_id == repo.repo_id)
        )
        snap_rows = (await db_session.execute(snap_stmt)).scalars().all()
        assert len(snap_rows) == 0

    async def test_gc_merge_commit_both_parents_preserved(
        self, db_session: AsyncSession
    ) -> None:
        """Both sides of a merge, and their common base, are kept."""
        repo = await _db_repo(db_session)
        base_cid, left_cid, right_cid, merge_cid = _cid(), _cid(), _cid(), _cid()

        # (commit id, parent ids) in insertion order: base, two sides, merge.
        layout = [
            (base_cid, []),
            (left_cid, [base_cid]),
            (right_cid, [base_cid]),
            (merge_cid, [left_cid, right_cid]),
        ]
        for cid, parents in layout:
            await _db_commit(
                db_session, repo.repo_id, commit_id=cid, parent_ids=parents
            )
        await _db_branch(db_session, repo.repo_id, head_commit_id=merge_cid)
        await db_session.commit()

        outcome = await run_gc(db_session, repo.repo_id)
        assert outcome.commits_deleted == 0
        assert outcome.reachable_commit_count == 4

        for cid in (base_cid, left_cid, right_cid, merge_cid):
            found = await db_session.get(MusehubCommit, cid)
            assert found is not None

    async def test_gc_commit_with_no_snapshot_skips_snapshot_delete(
        self, db_session: AsyncSession
    ) -> None:
        """Collecting an orphan that has no snapshot deletes no snapshots."""
        repo = await _db_repo(db_session)
        tip = await _db_commit(db_session, repo.repo_id)
        # The orphaned commit carries no snapshot at all.
        await _db_commit(db_session, repo.repo_id, snapshot_id=None)
        await _db_branch(db_session, repo.repo_id, head_commit_id=tip.commit_id)
        await db_session.commit()

        outcome = await run_gc(db_session, repo.repo_id)
        assert outcome.commits_deleted == 1
        assert outcome.snapshots_deleted == 0
| 617 | |
| 618 | |
| 619 | # ────────────────────────────────────────────────────────────────────────────── |
| 620 | # Layer 6 — Security |
| 621 | # ────────────────────────────────────────────────────────────────────────────── |
| 622 | |
| 623 | |
class TestSecurityGC:
    """Layer 6 — repo isolation and tolerance of unknown or dangling ids."""

    async def test_gc_does_not_cross_repo_boundaries(self, db_session: AsyncSession) -> None:
        """GC for repo1 must never delete commits belonging to repo2."""
        target = await _db_repo(db_session)
        bystander = await _db_repo(db_session)

        # The bystander repo owns a commit that no branch references.
        foreign = await _db_commit(db_session, bystander.repo_id)

        # The target repo has one live commit behind its branch head.
        tip = await _db_commit(db_session, target.repo_id)
        await _db_branch(db_session, target.repo_id, head_commit_id=tip.commit_id)
        await _db_branch(db_session, bystander.repo_id, head_commit_id=None)
        await db_session.commit()

        await run_gc(db_session, target.repo_id)

        # The bystander's commit must be untouched.
        found = await db_session.get(MusehubCommit, foreign.commit_id)
        assert found is not None

    async def test_gc_nonexistent_repo_returns_empty_result(
        self, db_session: AsyncSession
    ) -> None:
        """GC on an unknown repo id must not raise and reports zero counts."""
        outcome = await run_gc(db_session, "nonexistent-repo-id")
        assert outcome.commits_deleted == 0
        assert outcome.reachable_commit_count == 0

    async def test_gc_with_unknown_parent_ids_does_not_crash(
        self, db_session: AsyncSession
    ) -> None:
        """A head whose parent id is missing from the DB is still handled."""
        repo = await _db_repo(db_session)
        tip_cid = _cid()
        # The only parent id refers to a commit that was never inserted.
        dangling_parents = ["phantom-commit-id-not-in-db"]
        await _db_commit(
            db_session,
            repo.repo_id,
            commit_id=tip_cid,
            parent_ids=dangling_parents,
        )
        await _db_branch(db_session, repo.repo_id, head_commit_id=tip_cid)
        await db_session.commit()

        # The unknown parent must be tolerated: no exception, head still counted.
        outcome = await run_gc(db_session, repo.repo_id)
        assert outcome.commits_deleted == 0
        assert outcome.reachable_commit_count == 1
| 671 | |
| 672 | |
| 673 | # ────────────────────────────────────────────────────────────────────────────── |
| 674 | # Layer 7 — Performance |
| 675 | # ────────────────────────────────────────────────────────────────────────────── |
| 676 | |
| 677 | |
class TestPerformanceGC:
    """Layer 7 — wall-clock bounds on ``run_gc`` for small repos."""

    async def test_gc_completes_quickly_small_repo(self, db_session: AsyncSession) -> None:
        """A clean 20-commit chain is processed in under one second."""
        import time

        repo = await _db_repo(db_session)
        chain = [_cid() for _ in range(20)]
        # Each commit's parent is the one before it; the first has none.
        for parent, cid in zip([None, *chain], chain):
            await _db_commit(
                db_session,
                repo.repo_id,
                commit_id=cid,
                parent_ids=[] if parent is None else [parent],
            )
        await _db_branch(db_session, repo.repo_id, head_commit_id=chain[-1])
        await db_session.commit()

        # Only the run_gc call itself is timed.
        began = time.perf_counter()
        outcome = await run_gc(db_session, repo.repo_id)
        took = time.perf_counter() - began

        assert outcome.commits_deleted == 0
        assert took < 1.0

    async def test_gc_with_mixed_load(self, db_session: AsyncSession) -> None:
        """30 live commits plus 50 orphans with snapshots finish in under two seconds."""
        import time

        repo = await _db_repo(db_session)
        live_chain = [_cid() for _ in range(30)]
        for parent, cid in zip([None, *live_chain], live_chain):
            await _db_commit(
                db_session,
                repo.repo_id,
                commit_id=cid,
                parent_ids=[] if parent is None else [parent],
            )

        # 50 orphaned commits, each with a snapshot of its own.
        for _ in range(50):
            doomed = await _db_snapshot(db_session, repo.repo_id)
            await _db_commit(
                db_session, repo.repo_id, snapshot_id=doomed.snapshot_id
            )

        await _db_branch(db_session, repo.repo_id, head_commit_id=live_chain[-1])
        await db_session.commit()

        # Only the run_gc call itself is timed.
        began = time.perf_counter()
        outcome = await run_gc(db_session, repo.repo_id)
        took = time.perf_counter() - began

        assert outcome.commits_deleted == 50
        assert outcome.snapshots_deleted == 50
        assert took < 2.0