gabriel / musehub public

test_gc_object_refs.py file-level

at sha256:3 · View file ↗ · Intel ↗

History
1 files
1 commits
0 hotspots
0 🧊 dead
0 💥 blast risk
sha256:0 fix: fall back to any indexed mpack in read_object_bytes when push mpac… · gabriel · Jun 17, 2026
1 """TDD: GC object-ref pruning and global object cleanup.
2
3 After a force push or branch rewrite, old commits and their snapshots become
4 orphaned. Objects referenced exclusively by orphaned snapshots should have
5 their ref rows removed. Objects with zero remaining refs across all repos
6 should be deleted from musehub_objects and from storage.
7
8 Coverage matrix:
9 1. GC deletes ref row for object unreachable from all live snapshots.
10 2. GC does NOT delete ref when object is still referenced by a live snapshot.
11 3. GC deletes musehub_objects row when no refs remain globally.
12 4. GC does NOT delete musehub_objects row when another repo still holds a ref.
13 5. GC on a clean repo (no orphaned commits) is a no-op — no refs disturbed.
14 6. GCResult fields are populated correctly.
15 """
16 from __future__ import annotations
17
18 import secrets
19 from datetime import datetime, timezone
20
21 import msgpack
22 import pytest
23 import sqlalchemy as sa
24 from sqlalchemy.ext.asyncio import AsyncSession
25
26 from muse.core.types import blob_id
27 from musehub.db.musehub_repo_models import MusehubBranch, MusehubCommit, MusehubCommitRef, MusehubObject, MusehubObjectRef, MusehubSnapshot, MusehubSnapshotRef
28 from musehub.services.musehub_gc import run_gc
29 from musehub.types.json_types import StrDict
30 from tests.factories import create_repo
31
32
def _now() -> datetime:
    """Return the current moment as a timezone-aware UTC datetime."""
    utc = timezone.utc
    return datetime.now(utc)
35
36
def _oid(seed: str) -> str:
    """Derive an object id by passing the encoded bytes of *seed* to ``blob_id``."""
    raw = seed.encode()
    return blob_id(raw)
39
40
def _manifest(mapping: StrDict) -> bytes:
    """Serialize a ``{path: object_id}`` mapping to msgpack bytes."""
    packed = msgpack.packb(mapping, use_bin_type=True)
    return packed
44
45
46 # ---------------------------------------------------------------------------
47 # Low-level DB helpers
48 # ---------------------------------------------------------------------------
49
async def _insert_object(session: AsyncSession, oid: str, repo_id: str) -> None:
    """Insert a minimal musehub_objects row unless one with *oid* already exists.

    Only the object row is written; no musehub_object_refs row is created
    here, so tests that need a ref call ``_insert_ref`` separately.

    Args:
        session: Session used for the lookup, the insert and the flush.
        oid: Object id of the row to create.
        repo_id: Accepted for call-site compatibility; not used by this helper.
    """
    lookup = sa.select(MusehubObject.object_id).where(MusehubObject.object_id == oid)
    found = (await session.execute(lookup)).scalar_one_or_none()
    # Test row presence explicitly rather than relying on the id's truthiness.
    if found is not None:
        return

    content = b"test content"
    session.add(
        MusehubObject(
            object_id=oid,
            # Keep the recorded size consistent with the cached payload.
            size_bytes=len(content),
            path="test.md",
            content_cache=content,
        )
    )
    await session.flush()
65
66
async def _insert_ref(session: AsyncSession, repo_id: str, oid: str) -> None:
    """Add a musehub_object_refs row for (repo_id, oid) unless one is already there."""
    query = sa.select(MusehubObjectRef).where(
        MusehubObjectRef.repo_id == repo_id,
        MusehubObjectRef.object_id == oid,
    )
    outcome = await session.execute(query)
    current = outcome.scalar_one_or_none()
    if not current:
        session.add(MusehubObjectRef(repo_id=repo_id, object_id=oid))
        await session.flush()
79
80
async def _insert_snapshot(
    session: AsyncSession,
    snapshot_id: str,
    manifest: dict[str, str],
    repo_id: str = "",
) -> MusehubSnapshot:
    """Create a snapshot row from *manifest* and flush it.

    The manifest is stored msgpack-encoded via ``_manifest``.  When *repo_id*
    is non-empty, a MusehubSnapshotRef linking the snapshot to that repo is
    added as well.  Returns the new MusehubSnapshot instance.
    """
    encoded = _manifest(manifest)
    snapshot_row = MusehubSnapshot(
        snapshot_id=snapshot_id,
        manifest_blob=encoded,
        entry_count=len(manifest),
        directories=[],
    )
    session.add(snapshot_row)

    if repo_id:
        link = MusehubSnapshotRef(repo_id=repo_id, snapshot_id=snapshot_id)
        session.add(link)

    await session.flush()
    return snapshot_row
98
99
async def _insert_commit(
    session: AsyncSession,
    repo_id: str,
    commit_id: str,
    snapshot_id: str | None = None,
    parent_ids: list[str] | None = None,
    branch: str = "main",
) -> MusehubCommit:
    """Create a commit row plus its MusehubCommitRef for *repo_id* and flush.

    A missing or empty *parent_ids* is stored as an empty list.  Returns the
    new MusehubCommit instance.
    """
    parents = parent_ids if parent_ids else []
    commit_row = MusehubCommit(
        commit_id=commit_id,
        message="test commit",
        author="test-user",
        branch=branch,
        parent_ids=parents,
        snapshot_id=snapshot_id,
        timestamp=_now(),
    )
    commit_link = MusehubCommitRef(repo_id=repo_id, commit_id=commit_id)

    session.add(commit_row)
    session.add(commit_link)
    await session.flush()
    return commit_row
121
122
async def _insert_branch(
    session: AsyncSession,
    repo_id: str,
    head_commit_id: str,
    name: str = "main",
) -> MusehubBranch:
    """Create a branch row whose head is *head_commit_id* and flush it.

    The branch id comes from ``compute_branch_id(repo_id, name)``.  Returns
    the new MusehubBranch instance.
    """
    from musehub.core.genesis import compute_branch_id

    branch_key = compute_branch_id(repo_id, name)
    branch_row = MusehubBranch(
        branch_id=branch_key,
        repo_id=repo_id,
        name=name,
        head_commit_id=head_commit_id,
    )
    session.add(branch_row)
    await session.flush()
    return branch_row
139
140
async def _ref_exists(session: AsyncSession, repo_id: str, oid: str) -> bool:
    """Report whether a musehub_object_refs row exists for (repo_id, oid)."""
    query = sa.select(MusehubObjectRef).where(
        MusehubObjectRef.repo_id == repo_id,
        MusehubObjectRef.object_id == oid,
    )
    outcome = await session.execute(query)
    match = outcome.scalar_one_or_none()
    return match is not None
149
150
async def _object_exists(session: AsyncSession, oid: str) -> bool:
    """Report whether a musehub_objects row with id *oid* exists."""
    query = sa.select(MusehubObject.object_id).where(MusehubObject.object_id == oid)
    outcome = await session.execute(query)
    match = outcome.scalar_one_or_none()
    return match is not None
156
157
158 # ---------------------------------------------------------------------------
159 # Test 1: GC removes ref for object only in orphaned snapshot
160 # ---------------------------------------------------------------------------
161
@pytest.mark.asyncio
async def test_gc_removes_stale_ref_for_orphaned_object(
    db_session: AsyncSession,
) -> None:
    """GC must drop the ref row of an object that only an orphaned snapshot names."""
    repo = await create_repo(db_session, slug="gc-stale-ref", owner="test-user-wire")
    rid = repo.repo_id
    oid = _oid("stale-object-only-in-orphaned-snapshot")

    # C1 -> C2 is the chain left behind once the branch is force-pushed to C3.
    orphan_snap = f"snap_{secrets.token_hex(4)}"
    first_commit = secrets.token_hex(16)
    second_commit = secrets.token_hex(16)
    head_commit = secrets.token_hex(16)
    live_snap = f"snap_{secrets.token_hex(4)}"

    await _insert_object(db_session, oid, rid)
    await _insert_ref(db_session, rid, oid)
    # Only the orphaned snapshot lists the object; the live one is empty.
    await _insert_snapshot(db_session, orphan_snap, {"file.md": oid}, repo_id=rid)
    await _insert_snapshot(db_session, live_snap, {}, repo_id=rid)
    await _insert_commit(db_session, rid, first_commit)
    await _insert_commit(
        db_session,
        rid,
        second_commit,
        snapshot_id=orphan_snap,
        parent_ids=[first_commit],
    )
    # The branch head points at C3, which has no parents.
    await _insert_commit(db_session, rid, head_commit, snapshot_id=live_snap)
    await _insert_branch(db_session, rid, head_commit)
    await db_session.commit()

    result = await run_gc(db_session, rid)

    assert result.object_refs_deleted >= 1, "stale ref must be deleted"
    still_there = await _ref_exists(db_session, rid, oid)
    assert not still_there, "ref row must be gone after GC"
194
195
196 # ---------------------------------------------------------------------------
197 # Test 2: GC keeps ref when object is still in a live snapshot
198 # ---------------------------------------------------------------------------
199
@pytest.mark.asyncio
async def test_gc_keeps_ref_for_live_object(
    db_session: AsyncSession,
) -> None:
    """GC must keep the ref of an object listed by both an orphaned and a live snapshot."""
    repo = await create_repo(db_session, slug="gc-live-ref", owner="test-user-wire")
    rid = repo.repo_id
    oid = _oid("object-in-both-orphaned-and-live-snapshot")

    orphan_snap = f"snap_{secrets.token_hex(4)}"
    live_snap = f"snap_{secrets.token_hex(4)}"
    orphan_commit = secrets.token_hex(16)
    live_commit = secrets.token_hex(16)

    await _insert_object(db_session, oid, rid)
    await _insert_ref(db_session, rid, oid)
    # The same object appears in the orphaned and in the live snapshot.
    await _insert_snapshot(db_session, orphan_snap, {"a.md": oid}, repo_id=rid)
    await _insert_snapshot(db_session, live_snap, {"b.md": oid}, repo_id=rid)
    await _insert_commit(db_session, rid, orphan_commit, snapshot_id=orphan_snap)
    await _insert_commit(db_session, rid, live_commit, snapshot_id=live_snap)
    await _insert_branch(db_session, rid, live_commit)
    await db_session.commit()

    result = await run_gc(db_session, rid)

    assert result.object_refs_deleted == 0, "ref to live object must not be deleted"
    ref_present = await _ref_exists(db_session, rid, oid)
    assert ref_present, "ref row must survive GC when object is live"
228
229
230 # ---------------------------------------------------------------------------
231 # Test 3: GC deletes musehub_objects row when globally orphaned
232 # ---------------------------------------------------------------------------
233
@pytest.mark.asyncio
async def test_gc_deletes_globally_orphaned_object(
    db_session: AsyncSession,
) -> None:
    """Once the last ref is removed, GC must also remove the musehub_objects row."""
    repo = await create_repo(db_session, slug="gc-global-orphan", owner="test-user-wire")
    rid = repo.repo_id
    oid = _oid("globally-orphaned-no-other-repo-refs")

    orphan_snap = f"snap_{secrets.token_hex(4)}"
    live_snap = f"snap_{secrets.token_hex(4)}"
    orphan_commit = secrets.token_hex(16)
    live_commit = secrets.token_hex(16)

    await _insert_object(db_session, oid, rid)
    await _insert_ref(db_session, rid, oid)
    await _insert_snapshot(db_session, orphan_snap, {"file.md": oid}, repo_id=rid)
    # The live snapshot lists no objects at all.
    await _insert_snapshot(db_session, live_snap, {}, repo_id=rid)
    await _insert_commit(db_session, rid, orphan_commit, snapshot_id=orphan_snap)
    await _insert_commit(db_session, rid, live_commit, snapshot_id=live_snap)
    await _insert_branch(db_session, rid, live_commit)
    await db_session.commit()

    result = await run_gc(db_session, rid)

    assert result.objects_deleted >= 1, "globally orphaned object must be deleted from DB"
    object_present = await _object_exists(db_session, oid)
    assert not object_present, "musehub_objects row must be gone after GC"
261
262
263 # ---------------------------------------------------------------------------
264 # Test 4: GC does NOT delete musehub_objects when another repo still refs it
265 # ---------------------------------------------------------------------------
266
@pytest.mark.asyncio
async def test_gc_keeps_object_when_other_repo_holds_ref(
    db_session: AsyncSession,
) -> None:
    """An object shared with another repo must NOT be deleted from musehub_objects.

    repo_a holds the object only through an orphaned snapshot, so GC of repo_a
    is expected to prune repo_a's ref.  repo_b still holds a ref, so the
    musehub_objects row and repo_b's ref must both remain.
    """
    repo_a = await create_repo(db_session, slug="gc-shared-a", owner="test-user-wire")
    repo_b = await create_repo(db_session, slug="gc-shared-b", owner="test-user-wire")
    oid = _oid("shared-object-two-repos")

    # Set up repo_a with orphaned snapshot referencing the object
    snap_orphan_id = f"snap_{secrets.token_hex(4)}"
    snap_live_id = f"snap_{secrets.token_hex(4)}"
    c_orphan_id = secrets.token_hex(16)
    c_live_id = secrets.token_hex(16)

    await _insert_object(db_session, oid, repo_a.repo_id)
    # Both repos hold a ref
    await _insert_ref(db_session, repo_a.repo_id, oid)
    await _insert_ref(db_session, repo_b.repo_id, oid)

    await _insert_snapshot(db_session, snap_orphan_id, {"file.md": oid}, repo_id=repo_a.repo_id)
    await _insert_snapshot(db_session, snap_live_id, {}, repo_id=repo_a.repo_id)
    await _insert_commit(db_session, repo_a.repo_id, c_orphan_id, snapshot_id=snap_orphan_id)
    await _insert_commit(db_session, repo_a.repo_id, c_live_id, snapshot_id=snap_live_id)
    await _insert_branch(db_session, repo_a.repo_id, c_live_id)
    await db_session.commit()

    # GC repo_a — prunes repo_a's ref but repo_b's ref survives
    result = await run_gc(db_session, repo_a.repo_id)

    # Prove GC actually pruned repo_a's ref; without this the remaining
    # assertions would also hold if run_gc did nothing at all.
    assert result.object_refs_deleted >= 1, \
        "repo_a's stale ref must be deleted"
    assert not await _ref_exists(db_session, repo_a.repo_id, oid), \
        "repo_a ref must be gone after GC"

    assert result.objects_deleted == 0, \
        "must not delete object still referenced by repo_b"
    assert await _object_exists(db_session, oid), \
        "musehub_objects row must survive because repo_b still holds a ref"
    assert await _ref_exists(db_session, repo_b.repo_id, oid), \
        "repo_b ref must be untouched by repo_a GC"
303
304
305 # ---------------------------------------------------------------------------
306 # Test 5: Clean repo (no orphaned commits) — no refs disturbed
307 # ---------------------------------------------------------------------------
308
@pytest.mark.asyncio
async def test_gc_clean_repo_does_not_touch_refs(
    db_session: AsyncSession,
) -> None:
    """Running GC on a repo with nothing orphaned must leave every counter at zero."""
    repo = await create_repo(db_session, slug="gc-clean-repo", owner="test-user-wire")
    rid = repo.repo_id
    oid = _oid("clean-repo-live-object")

    only_snap = f"snap_{secrets.token_hex(4)}"
    only_commit = secrets.token_hex(16)

    # One commit, one snapshot, one object; the branch head reaches all of it.
    await _insert_object(db_session, oid, rid)
    await _insert_ref(db_session, rid, oid)
    await _insert_snapshot(db_session, only_snap, {"readme.md": oid}, repo_id=rid)
    await _insert_commit(db_session, rid, only_commit, snapshot_id=only_snap)
    await _insert_branch(db_session, rid, only_commit)
    await db_session.commit()

    result = await run_gc(db_session, rid)

    assert result.commits_deleted == 0
    assert result.snapshots_deleted == 0
    assert result.object_refs_deleted == 0
    assert result.objects_deleted == 0

    ref_present = await _ref_exists(db_session, rid, oid)
    assert ref_present, "live object ref must survive a clean GC run"
336
337
338 # ---------------------------------------------------------------------------
339 # Test 6: GCResult fields are correctly populated
340 # ---------------------------------------------------------------------------
341
@pytest.mark.asyncio
async def test_gc_result_fields(
    db_session: AsyncSession,
) -> None:
    """Each GCResult field must match what one GC run over a known layout removed."""
    repo = await create_repo(db_session, slug="gc-result-fields", owner="test-user-wire")
    rid = repo.repo_id
    oid = _oid("result-fields-object")

    orphan_snap = f"snap_{secrets.token_hex(4)}"
    live_snap = f"snap_{secrets.token_hex(4)}"
    orphan_commit = secrets.token_hex(16)
    live_commit = secrets.token_hex(16)

    # Layout: one orphaned commit/snapshot holding the object, one live
    # commit with an empty snapshot at the branch head.
    await _insert_object(db_session, oid, rid)
    await _insert_ref(db_session, rid, oid)
    await _insert_snapshot(db_session, orphan_snap, {"file.md": oid}, repo_id=rid)
    await _insert_snapshot(db_session, live_snap, {}, repo_id=rid)
    await _insert_commit(db_session, rid, orphan_commit, snapshot_id=orphan_snap)
    await _insert_commit(db_session, rid, live_commit, snapshot_id=live_snap)
    await _insert_branch(db_session, rid, live_commit)
    await db_session.commit()

    result = await run_gc(db_session, rid)

    assert result.repo_id == repo.repo_id
    assert result.commits_deleted == 1, "one orphaned commit"
    assert result.snapshots_deleted == 1, "one orphaned snapshot"
    assert result.object_refs_deleted == 1, "one stale ref"
    assert result.objects_deleted == 1, "one globally orphaned object"
    assert result.reachable_commit_count == 1, "one live commit"
    # The errors attribute must be a list even when nothing went wrong.
    assert isinstance(result.errors, list)