"""TDD — S3/MinIO is for blobs only. Commits and snapshots live in the DB. Architecture: - Blobs: MinIO mpack:// (byte-range indexed via musehub_mpack_index) - Commits: DB only (musehub_commits) — no individual S3 writes - Snapshots: DB only (musehub_snapshots, manifest_blob cache) — no individual S3 writes Phase 2/3 added individual S3 writes for commits and snapshots. This was wrong — commits/snapshots are structured metadata that belongs in the DB. Writing them to S3 individually creates triple-storage and S3-read overhead on the hot serving path when the DB already has the data faster. Blobs stay in the covering mpack (mpack:// URI). Commits and snapshots have storage_uri=NULL (not written to S3 individually). Tests: BOS-1 push → commit storage_uri is NULL (no individual S3 write) BOS-2 push → snapshot storage_uri is NULL (no individual S3 write) BOS-3 push → blob storage_uri is mpack:// (still correct) BOS-4 wire_fetch_mpack serves commits from DB, not S3 backend.get() BOS-5 wire_fetch_mpack serves snapshots from DB, not S3 backend.get() BOS-6 merge_proposal → merge commit storage_uri is NULL """ from __future__ import annotations import datetime import pytest from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from unittest.mock import AsyncMock, MagicMock, patch, call from muse.core.types import fake_id, blob_id from musehub.core.genesis import compute_identity_id, compute_repo_id from musehub.db.musehub_repo_models import ( MusehubBranch, MusehubCommit, MusehubCommitRef, MusehubCommitGraph, MusehubObject, MusehubRepo, MusehubSnapshot, MusehubSnapshotRef, ) def _uid() -> str: import secrets return secrets.token_hex(6) async def _make_repo(session: AsyncSession, slug: str) -> MusehubRepo: now = datetime.datetime.now(tz=datetime.timezone.utc) owner_id = compute_identity_id(b"gabriel") repo = MusehubRepo( repo_id=compute_repo_id(owner_id, slug, "code", now.isoformat()), name=slug, owner="gabriel", slug=slug, visibility="public", owner_user_id=owner_id, created_at=now, updated_at=now, ) session.add(repo) await session.flush() return repo def _mock_backend() -> MagicMock: b = MagicMock() b.put = AsyncMock(return_value="s3://muse-objects/fake") b.get = AsyncMock(return_value=None) b.get_mpack = AsyncMock(return_value=None) b.put_mpack = AsyncMock(return_value=None) b.presign_mpack_get = AsyncMock(return_value="https://minio.example.com/mpack?sig=x") return b # --------------------------------------------------------------------------- # BOS-1 commit storage_uri is NULL after push # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_BOS1_commit_storage_uri_is_null_after_push( db_session: AsyncSession, ) -> None: """Commits must NOT be written to S3 individually — storage_uri stays NULL.""" from musehub.services.musehub_sync import commit_files_to_repo repo = await _make_repo(db_session, f"bos-commit-{_uid()}") backend = _mock_backend() with patch("musehub.storage.backends.get_backend", return_value=backend): commit_id = await commit_files_to_repo( db_session, repo_id=repo.repo_id, branch="main", files={"a.py": b"x=1"}, message="test", author="gabriel", ) commit_row = await db_session.get(MusehubCommit, commit_id) assert commit_row is not None assert commit_row.storage_uri is None, ( f"Commits must NOT be written to S3. Got storage_uri={commit_row.storage_uri!r}. " "Commits belong in the DB only — S3 is for blobs." ) # Verify backend.put was NOT called with a commit key for c in backend.put.call_args_list: assert commit_id not in str(c), ( "backend.put must not be called for commit objects" ) # --------------------------------------------------------------------------- # BOS-2 snapshot storage_uri is NULL after push # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_BOS2_snapshot_storage_uri_is_null_after_push( db_session: AsyncSession, ) -> None: """Snapshots must NOT be written to S3 individually — storage_uri stays NULL.""" from musehub.services.musehub_sync import commit_files_to_repo from musehub.db.musehub_repo_models import MusehubCommit as DbCommit repo = await _make_repo(db_session, f"bos-snap-{_uid()}") backend = _mock_backend() with patch("musehub.storage.backends.get_backend", return_value=backend): commit_id = await commit_files_to_repo( db_session, repo_id=repo.repo_id, branch="main", files={"b.py": b"y=2"}, message="test", author="gabriel", ) commit_row = await db_session.get(DbCommit, commit_id) snap_row = await db_session.get(MusehubSnapshot, commit_row.snapshot_id) assert snap_row is not None assert snap_row.storage_uri is None, ( f"Snapshots must NOT be written to S3. Got storage_uri={snap_row.storage_uri!r}. " "Snapshots belong in the DB only — S3 is for blobs." ) # --------------------------------------------------------------------------- # BOS-3 blob storage_uri is mpack:// after push receive # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_BOS3_blob_storage_uri_is_mpack_after_push( db_session: AsyncSession, ) -> None: """Blobs must still get mpack:// storage_uri — byte-range served from MinIO.""" from muse.core.mpack import build_wire_mpack from musehub.services.musehub_wire_push import wire_push_unpack_mpack repo = await _make_repo(db_session, f"bos-blob-{_uid()}") content = b"blob content for bos test" oid = blob_id(content) wire = build_wire_mpack({ "blobs": [{"object_id": oid, "content": content}], "commits": [], "snapshots": [], }) mpack_key = blob_id(wire) backend = _mock_backend() backend.get_mpack = AsyncMock(return_value=wire) with patch("musehub.storage.backends.get_backend", return_value=backend): await wire_push_unpack_mpack( db_session, repo.repo_id, mpack_key, "gabriel", branch="main", head_commit_id="", commits_count=0, blobs_count=1, ) obj_row = await db_session.get(MusehubObject, oid) assert obj_row is not None, "Blob must be in musehub_objects" assert obj_row.storage_uri and obj_row.storage_uri.startswith("mpack://"), ( f"Blob storage_uri must be mpack://, got {obj_row.storage_uri!r}" ) # --------------------------------------------------------------------------- # BOS-4 wire_fetch serves commits from DB, no S3 backend.get() # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_BOS4_commits_served_from_db_not_s3( db_session: AsyncSession, ) -> None: """wire_fetch_mpack must serve commits from DB rows, not S3 backend.get().""" from musehub.services.musehub_wire_fetch import wire_fetch_mpack repo = await _make_repo(db_session, f"bos-fetch-commit-{_uid()}") now = datetime.datetime.now(tz=datetime.timezone.utc) snap_id = fake_id("snap-bos4") commit_id = fake_id("commit-bos4") db_session.add(MusehubSnapshot( snapshot_id=snap_id, manifest_blob=b"\x80", entry_count=0, directories=[], storage_uri=None, )) db_session.add(MusehubSnapshotRef(repo_id=repo.repo_id, snapshot_id=snap_id)) db_session.add(MusehubCommit( commit_id=commit_id, branch="main", parent_ids=[], message="bos4", author="gabriel", timestamp=now, snapshot_id=snap_id, storage_uri=None, )) db_session.add(MusehubCommitRef(repo_id=repo.repo_id, commit_id=commit_id)) from musehub.db.musehub_repo_models import MusehubCommitGraph db_session.add(MusehubCommitGraph( commit_id=commit_id, parent_ids=[], generation=0, snapshot_id=snap_id, )) await db_session.flush() backend = _mock_backend() with patch("musehub.services.musehub_wire_fetch.get_backend", return_value=backend), \ patch("musehub.storage.backends.get_backend", return_value=backend): result = await wire_fetch_mpack( db_session, repo.repo_id, want=[commit_id], have=[] ) # backend.get must NOT be called with a commit_id — commits come from DB get_calls = [str(c) for c in backend.get.call_args_list] commit_s3_reads = [c for c in get_calls if commit_id in c] assert not commit_s3_reads, ( "wire_fetch_mpack must NOT call backend.get() for commits. " f"Got S3 reads: {commit_s3_reads}. Commits are served from DB." ) # --------------------------------------------------------------------------- # BOS-6 merge_proposal commit storage_uri is NULL # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_BOS6_merge_proposal_commit_storage_uri_is_null( db_session: AsyncSession, ) -> None: """Merge commits created server-side must NOT have individual S3 storage_uri.""" from musehub.services import musehub_proposals repo = await _make_repo(db_session, f"bos-merge-{_uid()}") now = datetime.datetime.now(tz=datetime.timezone.utc) snap_id = fake_id("snap-merge-bos") commit_id = fake_id("commit-merge-bos") db_session.add(MusehubSnapshot( snapshot_id=snap_id, manifest_blob=b"\x80", entry_count=0, directories=[], )) db_session.add(MusehubSnapshotRef(repo_id=repo.repo_id, snapshot_id=snap_id)) db_session.add(MusehubCommit( commit_id=commit_id, branch="feat", parent_ids=[], message="feat commit", author="gabriel", timestamp=now, snapshot_id=snap_id, )) db_session.add(MusehubCommitRef(repo_id=repo.repo_id, commit_id=commit_id)) from musehub.core.genesis import compute_branch_id db_session.add(MusehubBranch( branch_id=compute_branch_id(repo.repo_id, "feat"), repo_id=repo.repo_id, name="feat", head_commit_id=commit_id, )) await db_session.flush() proposal = await musehub_proposals.create_proposal( db_session, repo_id=repo.repo_id, title="Merge feat", from_branch="feat", to_branch="main", ) backend = _mock_backend() with patch("musehub.storage.backends.get_backend", return_value=backend): merged = await musehub_proposals.merge_proposal( db_session, repo.repo_id, proposal.proposal_id ) merge_commit = await db_session.get(MusehubCommit, merged.merge_commit_id) assert merge_commit is not None assert merge_commit.storage_uri is None, ( f"Merge commit must NOT be written to S3. Got storage_uri={merge_commit.storage_uri!r}. " "Commits belong in DB only." )