gabriel / musehub public
test_blob_only_object_store.py python
269 lines 10.7 KB
Raw
sha256:94ef169c149a452bff7c604ded8b280b19bd477c2dabcb56972780b0b784c7aa Merge 'fix/assignee-sigil-inline' into 'dev' — proposal: As… Human 1 day ago
1 """TDD — S3/MinIO is for blobs only. Commits and snapshots live in the DB.
2
3 Architecture:
4 - Blobs: MinIO mpack:// (byte-range indexed via musehub_mpack_index)
5 - Commits: DB only (musehub_commits) — no individual S3 writes
6 - Snapshots: DB only (musehub_snapshots, manifest_blob cache) — no individual S3 writes
7
8 Phase 2/3 added individual S3 writes for commits and snapshots. This was
9 wrong — commits/snapshots are structured metadata that belongs in the DB.
10 Writing them to S3 individually creates triple-storage and S3-read overhead
11 on the hot serving path when the DB already has the data faster.
12
13 Blobs stay in the covering mpack (mpack:// URI). Commits and snapshots have
14 storage_uri=NULL (not written to S3 individually).
15
16 Tests:
17 BOS-1 push → commit storage_uri is NULL (no individual S3 write)
18 BOS-2 push → snapshot storage_uri is NULL (no individual S3 write)
19 BOS-3 push → blob storage_uri is mpack:// (still correct)
20 BOS-4 wire_fetch_mpack serves commits from DB, not S3 backend.get()
21 BOS-5 wire_fetch_mpack serves snapshots from DB, not S3 backend.get() (TODO: not yet implemented in this file)
22 BOS-6 merge_proposal → merge commit storage_uri is NULL
23 """
24 from __future__ import annotations
25
26 import datetime
27 import pytest
28 from sqlalchemy import select
29 from sqlalchemy.ext.asyncio import AsyncSession
30 from unittest.mock import AsyncMock, MagicMock, patch, call
31
32 from muse.core.types import fake_id, blob_id
33 from musehub.core.genesis import compute_identity_id, compute_repo_id
34 from musehub.db.musehub_repo_models import (
35 MusehubBranch, MusehubCommit, MusehubCommitRef, MusehubCommitGraph,
36 MusehubObject, MusehubRepo, MusehubSnapshot, MusehubSnapshotRef,
37 )
38
39
def _uid() -> str:
    """Return a random 12-character lowercase hex string.

    Used as a suffix so every test creates a repo with a distinct slug.
    """
    from secrets import token_hex

    # 6 random bytes render as 12 hex characters.
    return token_hex(6)
43
44
async def _make_repo(session: AsyncSession, slug: str) -> MusehubRepo:
    """Add a public repo owned by ``gabriel`` to *session*, flush, and return it.

    Args:
        session: Async session the new row is added to and flushed through.
        slug: Used both as the repo's ``name`` and as its ``slug``.

    Returns:
        The ``MusehubRepo`` instance after ``session.flush()`` has run.
    """
    created = datetime.datetime.now(tz=datetime.timezone.utc)
    owner_identity = compute_identity_id(b"gabriel")

    # The repo id is derived from the owner, the slug, the literal "code"
    # and the creation timestamp in ISO format.
    repo_id = compute_repo_id(owner_identity, slug, "code", created.isoformat())

    row = MusehubRepo(
        repo_id=repo_id,
        name=slug,
        owner="gabriel",
        slug=slug,
        visibility="public",
        owner_user_id=owner_identity,
        created_at=created,
        updated_at=created,
    )
    session.add(row)
    await session.flush()
    return row
56
57
def _mock_backend() -> MagicMock:
    """Build a stand-in storage backend whose async methods return canned values.

    Returns:
        A ``MagicMock`` with ``put``, ``get``, ``get_mpack``, ``put_mpack`` and
        ``presign_mpack_get`` each replaced by an ``AsyncMock``.
    """
    # Method name -> value the awaited call resolves to.
    canned_results = {
        "put": "s3://muse-objects/fake",
        "get": None,
        "get_mpack": None,
        "put_mpack": None,
        "presign_mpack_get": "https://minio.example.com/mpack?sig=x",
    }

    backend = MagicMock()
    for method_name, result in canned_results.items():
        setattr(backend, method_name, AsyncMock(return_value=result))
    return backend
66
67
68 # ---------------------------------------------------------------------------
69 # BOS-1 commit storage_uri is NULL after push
70 # ---------------------------------------------------------------------------
71
@pytest.mark.asyncio
async def test_BOS1_commit_storage_uri_is_null_after_push(
    db_session: AsyncSession,
) -> None:
    """BOS-1: after a push the commit row has storage_uri NULL and no put() names it.

    Commits one file through ``commit_files_to_repo`` with the storage backend
    patched to a mock, then inspects both the stored commit row and the
    calls recorded on ``backend.put``.
    """
    from musehub.services.musehub_sync import commit_files_to_repo

    repo = await _make_repo(db_session, f"bos-commit-{_uid()}")
    storage = _mock_backend()

    with patch("musehub.storage.backends.get_backend", return_value=storage):
        commit_id = await commit_files_to_repo(
            db_session,
            repo_id=repo.repo_id,
            branch="main",
            files={"a.py": b"x=1"},
            message="test",
            author="gabriel",
        )

    row = await db_session.get(MusehubCommit, commit_id)
    assert row is not None

    uri = row.storage_uri
    assert uri is None, (
        f"Commits must NOT be written to S3. Got storage_uri={uri!r}. "
        "Commits belong in the DB only — S3 is for blobs."
    )

    # None of the recorded put() calls may mention the commit id.
    recorded_puts = storage.put.call_args_list
    assert all(commit_id not in str(put_call) for put_call in recorded_puts), (
        "backend.put must not be called for commit objects"
    )
101
102
103 # ---------------------------------------------------------------------------
104 # BOS-2 snapshot storage_uri is NULL after push
105 # ---------------------------------------------------------------------------
106
@pytest.mark.asyncio
async def test_BOS2_snapshot_storage_uri_is_null_after_push(
    db_session: AsyncSession,
) -> None:
    """Snapshots must NOT be written to S3 individually — storage_uri stays NULL.

    Commits one file through ``commit_files_to_repo`` with the storage backend
    patched to a mock, then follows the commit row's ``snapshot_id`` to the
    snapshot row and checks that its ``storage_uri`` is ``None``.
    """
    from musehub.services.musehub_sync import commit_files_to_repo

    repo = await _make_repo(db_session, f"bos-snap-{_uid()}")
    backend = _mock_backend()

    with patch("musehub.storage.backends.get_backend", return_value=backend):
        commit_id = await commit_files_to_repo(
            db_session, repo_id=repo.repo_id, branch="main",
            files={"b.py": b"y=2"}, message="test", author="gabriel",
        )

    commit_row = await db_session.get(MusehubCommit, commit_id)
    # Guard before dereferencing: a missing commit row should fail the test
    # with a clear assertion rather than an AttributeError on ``None``.
    assert commit_row is not None, "Commit row must exist after push"

    snap_row = await db_session.get(MusehubSnapshot, commit_row.snapshot_id)
    assert snap_row is not None

    assert snap_row.storage_uri is None, (
        f"Snapshots must NOT be written to S3. Got storage_uri={snap_row.storage_uri!r}. "
        "Snapshots belong in the DB only — S3 is for blobs."
    )
132
133
134 # ---------------------------------------------------------------------------
135 # BOS-3 blob storage_uri is mpack:// after push receive
136 # ---------------------------------------------------------------------------
137
@pytest.mark.asyncio
async def test_BOS3_blob_storage_uri_is_mpack_after_push(
    db_session: AsyncSession,
) -> None:
    """BOS-3: unpacking a pushed mpack records the blob with an ``mpack://`` storage_uri.

    Builds a wire mpack holding a single blob, has the mocked backend return
    it from ``get_mpack``, runs ``wire_push_unpack_mpack`` and then inspects
    the resulting ``MusehubObject`` row.
    """
    from muse.core.mpack import build_wire_mpack
    from musehub.services.musehub_wire_push import wire_push_unpack_mpack

    repo = await _make_repo(db_session, f"bos-blob-{_uid()}")

    payload = b"blob content for bos test"
    object_id = blob_id(payload)
    wire_bytes = build_wire_mpack(
        {
            "blobs": [{"object_id": object_id, "content": payload}],
            "commits": [],
            "snapshots": [],
        }
    )
    pack_key = blob_id(wire_bytes)

    storage = _mock_backend()
    # The unpack step reads the uploaded pack back through get_mpack().
    storage.get_mpack = AsyncMock(return_value=wire_bytes)

    with patch("musehub.storage.backends.get_backend", return_value=storage):
        await wire_push_unpack_mpack(
            db_session,
            repo.repo_id,
            pack_key,
            "gabriel",
            branch="main",
            head_commit_id="",
            commits_count=0,
            blobs_count=1,
        )

    row = await db_session.get(MusehubObject, object_id)
    assert row is not None, "Blob must be in musehub_objects"

    uri = row.storage_uri
    assert (uri or "").startswith("mpack://"), (
        f"Blob storage_uri must be mpack://, got {uri!r}"
    )
169
170
171 # ---------------------------------------------------------------------------
172 # BOS-4 wire_fetch serves commits from DB, no S3 backend.get()
173 # ---------------------------------------------------------------------------
174
@pytest.mark.asyncio
async def test_BOS4_commits_served_from_db_not_s3(
    db_session: AsyncSession,
) -> None:
    """wire_fetch_mpack must serve commits from DB rows, not S3 backend.get().

    Seeds a snapshot, a commit and their ref/graph rows directly in the DB
    (both with ``storage_uri=None``), calls ``wire_fetch_mpack`` for that
    commit with the backend mocked, and checks that no recorded
    ``backend.get`` call mentions the commit id.
    """
    from musehub.services.musehub_wire_fetch import wire_fetch_mpack

    repo = await _make_repo(db_session, f"bos-fetch-commit-{_uid()}")
    now = datetime.datetime.now(tz=datetime.timezone.utc)
    snap_id = fake_id("snap-bos4")
    commit_id = fake_id("commit-bos4")

    db_session.add(MusehubSnapshot(
        snapshot_id=snap_id, manifest_blob=b"\x80", entry_count=0, directories=[],
        storage_uri=None,
    ))
    db_session.add(MusehubSnapshotRef(repo_id=repo.repo_id, snapshot_id=snap_id))
    db_session.add(MusehubCommit(
        commit_id=commit_id, branch="main", parent_ids=[], message="bos4",
        author="gabriel", timestamp=now, snapshot_id=snap_id, storage_uri=None,
    ))
    db_session.add(MusehubCommitRef(repo_id=repo.repo_id, commit_id=commit_id))
    db_session.add(MusehubCommitGraph(
        commit_id=commit_id, parent_ids=[], generation=0, snapshot_id=snap_id,
    ))
    await db_session.flush()

    backend = _mock_backend()

    # Patch both the name bound in the fetch module and the factory itself.
    with patch("musehub.services.musehub_wire_fetch.get_backend", return_value=backend), \
            patch("musehub.storage.backends.get_backend", return_value=backend):
        # Only the backend calls made during the fetch matter here; the
        # returned value is not inspected.
        await wire_fetch_mpack(
            db_session, repo.repo_id, want=[commit_id], have=[]
        )

    # backend.get must NOT be called with a commit_id — commits come from DB
    commit_s3_reads = [
        rendered
        for rendered in map(str, backend.get.call_args_list)
        if commit_id in rendered
    ]
    assert not commit_s3_reads, (
        "wire_fetch_mpack must NOT call backend.get() for commits. "
        f"Got S3 reads: {commit_s3_reads}. Commits are served from DB."
    )
218
219
220 # ---------------------------------------------------------------------------
221 # BOS-6 merge_proposal commit storage_uri is NULL
222 # ---------------------------------------------------------------------------
223
@pytest.mark.asyncio
async def test_BOS6_merge_proposal_commit_storage_uri_is_null(
    db_session: AsyncSession,
) -> None:
    """BOS-6: the commit produced by merging a proposal has storage_uri NULL.

    Seeds a ``feat`` branch with one commit, opens a proposal from ``feat``
    into ``main``, merges it with the storage backend mocked, and inspects
    the resulting merge commit row.
    """
    from musehub.core.genesis import compute_branch_id
    from musehub.services import musehub_proposals

    repo = await _make_repo(db_session, f"bos-merge-{_uid()}")
    created = datetime.datetime.now(tz=datetime.timezone.utc)
    snapshot_id = fake_id("snap-merge-bos")
    head_id = fake_id("commit-merge-bos")

    # Stage the snapshot, commit, their refs and the branch head in one go.
    db_session.add_all([
        MusehubSnapshot(
            snapshot_id=snapshot_id,
            manifest_blob=b"\x80",
            entry_count=0,
            directories=[],
        ),
        MusehubSnapshotRef(repo_id=repo.repo_id, snapshot_id=snapshot_id),
        MusehubCommit(
            commit_id=head_id,
            branch="feat",
            parent_ids=[],
            message="feat commit",
            author="gabriel",
            timestamp=created,
            snapshot_id=snapshot_id,
        ),
        MusehubCommitRef(repo_id=repo.repo_id, commit_id=head_id),
        MusehubBranch(
            branch_id=compute_branch_id(repo.repo_id, "feat"),
            repo_id=repo.repo_id,
            name="feat",
            head_commit_id=head_id,
        ),
    ])
    await db_session.flush()

    proposal = await musehub_proposals.create_proposal(
        db_session,
        repo_id=repo.repo_id,
        title="Merge feat",
        from_branch="feat",
        to_branch="main",
    )

    storage = _mock_backend()
    with patch("musehub.storage.backends.get_backend", return_value=storage):
        merged = await musehub_proposals.merge_proposal(
            db_session, repo.repo_id, proposal.proposal_id
        )

    merge_row = await db_session.get(MusehubCommit, merged.merge_commit_id)
    assert merge_row is not None

    uri = merge_row.storage_uri
    assert uri is None, (
        f"Merge commit must NOT be written to S3. Got storage_uri={uri!r}. "
        "Commits belong in DB only."
    )
File History 1 commit
sha256:94ef169c149a452bff7c604ded8b280b19bd477c2dabcb56972780b0b784c7aa Merge 'fix/assignee-sigil-inline' into 'dev' — proposal: As… Human 1 day ago