"""Phase 1 — Failing tests for Bug A and Bug B (issue #76). Bug A: object_refs only written for globally-new objects. When the same objects are pushed to a second repo, _new_oids is empty and _upsert_object_refs is never called for that repo. Bug B: mpack_index byte-range entries have the same filter — pre-existing objects get no inline byte_offset/byte_length row for the second repo. These tests must be RED before any production fix lands. They become the regression guard once Bug A and Bug B are fixed. Tests ----- ORP-1 First push to repo A writes object_refs for repo A. ORP-2 Second push of the SAME mpack to repo B writes object_refs for repo B. ORP-3 First push to repo A writes mpack_index byte ranges for the objects. ORP-4 Second push of the SAME mpack to repo B writes mpack_index byte ranges. ORP-5 object_refs count is correct for each repo independently (no cross-contamination). """ from __future__ import annotations import hashlib import os from unittest.mock import AsyncMock, MagicMock, patch import pytest import pytest_asyncio from sqlalchemy import select, func from sqlalchemy.ext.asyncio import AsyncSession from muse.core.mpack import build_wire_mpack from muse.core.types import blob_id, fake_id from musehub.core.genesis import compute_identity_id, compute_repo_id from musehub.db.musehub_repo_models import MusehubMPackIndex, MusehubObjectRef, MusehubRepo from musehub.services.musehub_repository import create_repo # --------------------------------------------------------------------------- # Test fixtures and shared mpack # --------------------------------------------------------------------------- _OWNER = "gabriel" _IDENTITY_ID = compute_identity_id(b"gabriel") # Three distinct blobs with real content — deterministic IDs. 
_BLOB_A_CONTENT = b"blob-alpha-content-for-repush-test"
_BLOB_B_CONTENT = b"blob-beta-content-for-repush-test"
_BLOB_C_CONTENT = b"blob-gamma-content-for-repush-test"


def _sha256_id(data: bytes) -> str:
    """Return the hex SHA-256 digest of *data* prefixed with ``sha256:``."""
    hex_digest = hashlib.sha256(data).hexdigest()
    return "sha256:" + hex_digest


_BLOB_A_OID = _sha256_id(_BLOB_A_CONTENT)
_BLOB_B_OID = _sha256_id(_BLOB_B_CONTENT)
_BLOB_C_OID = _sha256_id(_BLOB_C_CONTENT)

_BLOB_OIDS = [_BLOB_A_OID, _BLOB_B_OID, _BLOB_C_OID]

# A single wire mpack holding all three blobs; every push in this module
# sends these exact bytes, whichever repo is the target.
_SHARED_MPACK_BYTES = build_wire_mpack({
    "blobs": [
        {"object_id": oid, "content": content}
        for oid, content in (
            (_BLOB_A_OID, _BLOB_A_CONTENT),
            (_BLOB_B_OID, _BLOB_B_CONTENT),
            (_BLOB_C_OID, _BLOB_C_CONTENT),
        )
    ],
    "commits": [],
    "snapshots": [],
    "tags": [],
})
_SHARED_MPACK_KEY = blob_id(_SHARED_MPACK_BYTES)


async def _make_repo(session: AsyncSession, name: str) -> MusehubRepo:
    """Create a public repo called *name* for ``_OWNER`` and commit the session.

    The repo is created with ``initialize=False``; the object returned by
    ``create_repo`` is handed back to the caller after the commit.
    """
    repo = await create_repo(
        session,
        name=name,
        owner=_OWNER,
        owner_user_id=_IDENTITY_ID,
        visibility="public",
        initialize=False,
    )
    await session.commit()
    return repo


def _mock_backend(mpack_bytes: bytes) -> MagicMock:
    """Build a ``MagicMock`` backend whose async ``get_mpack`` returns *mpack_bytes*."""
    return MagicMock(get_mpack=AsyncMock(return_value=mpack_bytes))


async def _push(session: AsyncSession, repo_id: str, mpack_bytes: bytes, mpack_key: str) -> dict:
    """Invoke ``wire_push_unpack_mpack`` for *repo_id* against a mocked backend.

    ``get_backend`` is patched in three modules so that each lookup yields the
    same mock, whose ``get_mpack`` returns *mpack_bytes*.  The push is issued as
    ``_OWNER`` on branch ``main`` with zero commits and three blobs, and the
    value returned by ``wire_push_unpack_mpack`` is passed through unchanged.
    """
    # The function under test is imported here, ahead of installing the patches.
    from musehub.services.musehub_wire_push import wire_push_unpack_mpack

    backend = _mock_backend(mpack_bytes)
    wire_patch, push_patch, storage_patch = (
        patch(target, return_value=backend)
        for target in (
            "musehub.services.musehub_wire.get_backend",
            "musehub.services.musehub_wire_push.get_backend",
            "musehub.storage.backends.get_backend",
        )
    )
    with wire_patch, push_patch, storage_patch:
        outcome = await wire_push_unpack_mpack(
            session,
            repo_id,
            mpack_key,
            pusher_id=_OWNER,
            branch="main",
            head_commit_id=fake_id("head"),
            commits_count=0,
            blobs_count=3,
        )
    return outcome


async def _object_ref_count(session: AsyncSession, repo_id: str) -> int:
    """Return the number of ``MusehubObjectRef`` rows whose ``repo_id`` is *repo_id*."""
    stmt = select(func.count()).where(MusehubObjectRef.repo_id == repo_id)
    result = await session.execute(stmt)
    return result.scalar_one()


async def _mpack_index_with_byte_range_count(session: AsyncSession, mpack_id: str) -> int:
    """Return how many ``object`` index rows of *mpack_id* have a ``byte_offset``.

    Only ``MusehubMPackIndex`` rows with ``entity_type == "object"`` and a
    non-NULL ``byte_offset`` are counted.
    """
    stmt = select(func.count()).where(
        MusehubMPackIndex.mpack_id == mpack_id,
        MusehubMPackIndex.entity_type == "object",
        MusehubMPackIndex.byte_offset.is_not(None),
    )
    result = await session.execute(stmt)
    return result.scalar_one()


# ---------------------------------------------------------------------------
# ORP-1  First push to repo A writes object_refs
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_ORP1_first_push_writes_object_refs_for_repo_a(
    db_session: AsyncSession,
) -> None:
    """Baseline: a first push of the shared mpack gives repo A three object_refs."""
    repo_a = await _make_repo(db_session, "orp-repo-a-1")

    await _push(
        db_session,
        repo_a.repo_id,
        mpack_bytes=_SHARED_MPACK_BYTES,
        mpack_key=_SHARED_MPACK_KEY,
    )

    count = await _object_ref_count(db_session, repo_id=repo_a.repo_id)
    assert count == 3, (
        f"Expected 3 object_refs for repo A after first push, got {count}"
    )


# ---------------------------------------------------------------------------
# ORP-2  Second push of the SAME mpack to repo B writes object_refs for repo B
#         — Bug A: this FAILS before the fix
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_ORP2_repush_same_mpack_writes_object_refs_for_repo_b(
    db_session: AsyncSession,
) -> None:
    """Bug A: a repush of already-known objects must still write object_refs.

    Per issue #76, wire_push_unpack_mpack restricts itself to _new_oids
    (globally-new objects).  Once repo A's push has stored the blobs in
    musehub_objects, _new_oids is empty for repo B's push, so
    _upsert_object_refs is never invoked for repo B.
    """
    repo_a = await _make_repo(db_session, "orp-repo-a-2")
    repo_b = await _make_repo(db_session, "orp-repo-b-2")

    # Repo A goes first, so the objects are globally new on that push and
    # pre-existing by the time the identical mpack reaches repo B.
    for repo in (repo_a, repo_b):
        await _push(
            db_session,
            repo.repo_id,
            mpack_bytes=_SHARED_MPACK_BYTES,
            mpack_key=_SHARED_MPACK_KEY,
        )

    count = await _object_ref_count(db_session, repo_id=repo_b.repo_id)
    assert count == 3, (
        f"Bug A: expected 3 object_refs for repo B after repush of same mpack, got {count}. "
        f"_upsert_object_refs must be called with _cc_oids, not _new_oids."
    )


# ---------------------------------------------------------------------------
# ORP-3  First push to repo A writes mpack_index byte ranges
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_ORP3_first_push_writes_mpack_index_byte_ranges(
    db_session: AsyncSession,
) -> None:
    """Baseline: a first push records a byte_offset index row for each of the three blobs."""
    repo_a = await _make_repo(db_session, "orp-repo-a-3")

    await _push(
        db_session,
        repo_a.repo_id,
        mpack_bytes=_SHARED_MPACK_BYTES,
        mpack_key=_SHARED_MPACK_KEY,
    )

    count = await _mpack_index_with_byte_range_count(
        db_session, mpack_id=_SHARED_MPACK_KEY
    )
    assert count == 3, (
        f"Expected 3 mpack_index rows with byte ranges after first push, got {count}"
    )


# ---------------------------------------------------------------------------
# ORP-4  Second push of the SAME mpack to repo B writes mpack_index byte ranges
#         — Bug B: this FAILS before the fix
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_ORP4_repush_same_mpack_writes_mpack_index_byte_ranges(
    db_session: AsyncSession,
) -> None:
    """Bug B: byte-range index rows must be written even for known objects.

    Per issue #76, _new_oids_for_idx applies the same globally-new filter.
    With the objects already stored by repo A's push, repo B's push inserts
    no mpack_index rows at all.
    """
    repo_a = await _make_repo(db_session, "orp-repo-a-4")
    repo_b = await _make_repo(db_session, "orp-repo-b-4")

    # First push: objects and their byte ranges are written for the first time.
    await _push(
        db_session,
        repo_a.repo_id,
        mpack_bytes=_SHARED_MPACK_BYTES,
        mpack_key=_SHARED_MPACK_KEY,
    )

    # Drop this mpack's index rows so the repush starts from an empty index.
    wipe_index_rows = MusehubMPackIndex.__table__.delete().where(
        MusehubMPackIndex.mpack_id == _SHARED_MPACK_KEY
    )
    await db_session.execute(wipe_index_rows)
    await db_session.flush()

    # Second push of the identical mpack, now to repo B; byte ranges must reappear.
    await _push(
        db_session,
        repo_b.repo_id,
        mpack_bytes=_SHARED_MPACK_BYTES,
        mpack_key=_SHARED_MPACK_KEY,
    )

    count = await _mpack_index_with_byte_range_count(
        db_session, mpack_id=_SHARED_MPACK_KEY
    )
    assert count == 3, (
        f"Bug B: expected 3 mpack_index byte-range rows after repush, got {count}. "
        f"Inline byte-range insert must cover _cc_oids not _new_oids."
    )


# ---------------------------------------------------------------------------
# ORP-5  object_refs are per-repo — no cross-contamination
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_ORP5_object_refs_are_scoped_per_repo(
    db_session: AsyncSession,
) -> None:
    """object_refs are counted per repo_id rather than shared across repos.

    After the same mpack is pushed to both repos, each repo must report its
    own three rows.
    """
    repo_a = await _make_repo(db_session, "orp-repo-a-5")
    repo_b = await _make_repo(db_session, "orp-repo-b-5")

    for repo in (repo_a, repo_b):
        await _push(
            db_session,
            repo.repo_id,
            mpack_bytes=_SHARED_MPACK_BYTES,
            mpack_key=_SHARED_MPACK_KEY,
        )

    # Both counts are gathered before either assertion runs.
    count_a = await _object_ref_count(db_session, repo_id=repo_a.repo_id)
    count_b = await _object_ref_count(db_session, repo_id=repo_b.repo_id)

    assert count_a == 3, f"repo A should have 3 object_refs, got {count_a}"
    assert count_b == 3, f"repo B should have 3 object_refs, got {count_b}"