gabriel / musehub public

test_file_history_performance.py file-level

at sha256:3 · View file ↗ · Intel ↗

History
1 files
1 commits
0 hotspots
0 🧊 dead
0 💥 blast risk
sha256:0 fix: fall back to any indexed mpack in read_object_bytes when push mpac… · gabriel · Jun 17, 2026
1 """TDD tests for _fetch_file_history performance fix.
2
3 Problem: _fetch_file_history loads up to 300 commits and calls
4 get_snapshot_manifest() once per commit — 300 individual DB queries +
5 300 full msgpack deserializations per file page view.
6
7 Fix: batch-fetch all snapshot manifests with a single IN query using
8 get_snapshot_manifests_batch(), then look up the file path in the
9 resulting dict.
10
11 Covers:
12 _fetch_file_history — query count
13 - test_file_history_does_not_call_per_commit_manifest_fetch
14 - test_file_history_calls_batch_fetch_once
15
16 _fetch_file_history — correctness
17 - test_file_history_returns_only_commits_where_file_changed
18 - test_file_history_returns_empty_when_file_not_in_head
19 - test_file_history_returns_empty_when_no_commits
20 - test_file_history_respects_limit
21 - test_file_history_unchanged_file_returns_one_entry
22 """
23 from __future__ import annotations
24
25 import secrets
26 from datetime import datetime, timezone, timedelta
27 from contextlib import asynccontextmanager
28 from typing import AsyncGenerator
29
30 import msgpack
31 import pytest
32 from sqlalchemy.ext.asyncio import AsyncSession
33
34 from muse.core.types import long_id, now_utc_iso
35 from musehub.api.routes.musehub.ui_blob import _fetch_file_history
36 from musehub.core.genesis import compute_identity_id, compute_repo_id
37 from musehub.db import database as _database
38 from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubRepo, MusehubSnapshot, MusehubSnapshotRef
39 from musehub.types.json_types import JSONObject, StrDict
40
41 # ---------------------------------------------------------------------------
42 # Helpers
43 # ---------------------------------------------------------------------------
44
45 _OWNER_ID = compute_identity_id(b"perf-tester")
46 _FILE = "musehub/services/billing.py"
47 _OTHER_FILE = "musehub/services/auth.py"
48
49
50 def _uid() -> str:
51 return long_id(secrets.token_hex(32))
52
53
54 def _repo_id() -> str:
55 return compute_repo_id(_OWNER_ID, f"perf-test-{secrets.token_hex(4)}", "code",
56 now_utc_iso())
57
58
59 def _snap_id() -> str:
60 return long_id(secrets.token_hex(32))
61
62
63 def _obj_id(tag: str) -> str:
64 return long_id(tag.encode().hex().ljust(64, "0"))
65
66
67 def _manifest_blob(path_oid: StrDict) -> bytes:
68 return msgpack.packb(path_oid, use_bin_type=True)
69
70
71 async def _make_repo(session: AsyncSession) -> str:
72 rid = _repo_id()
73 now = datetime.now(tz=timezone.utc)
74 session.add(MusehubRepo(
75 repo_id=rid,
76 name="perf-test",
77 owner="perf-tester",
78 slug="perf-test",
79 visibility="public",
80 owner_user_id=_OWNER_ID,
81 created_at=now,
82 updated_at=now,
83 ))
84 await session.commit()
85 return rid
86
87
88 async def _add_snapshot(session: AsyncSession, repo_id: str, manifest: StrDict) -> str:
89 sid = _snap_id()
90 session.add(MusehubSnapshot(
91 snapshot_id=sid,
92 directories=[],
93 manifest_blob=_manifest_blob(manifest),
94 entry_count=len(manifest),
95 ))
96 session.add(MusehubSnapshotRef(repo_id=repo_id, snapshot_id=sid))
97 await session.flush()
98 return sid
99
100
101 @asynccontextmanager
102 async def _fresh_session() -> AsyncGenerator[AsyncSession, None]:
103 """Open a fresh session from the (test-overridden) factory.
104
105 Using the shared db_session for both writes and reads leaves asyncpg in
106 an unexpected state on teardown — this helper avoids that by keeping
107 read calls isolated in their own short-lived session.
108 """
109 async with _database._async_session_factory() as session:
110 yield session
111
112
113 async def _add_commit(
114 session: AsyncSession,
115 repo_id: str,
116 snapshot_id: str,
117 branch: str = "main",
118 ts_offset_seconds: int = 0,
119 message: str = "feat: change",
120 ) -> str:
121 cid = _uid()
122 now = datetime.now(tz=timezone.utc) + timedelta(seconds=ts_offset_seconds)
123 session.add(MusehubCommit(
124 commit_id=cid,
125 branch=branch,
126 parent_ids=[],
127 message=message,
128 author="tester",
129 timestamp=now,
130 snapshot_id=snapshot_id,
131 ))
132 session.add(MusehubCommitRef(repo_id=repo_id, commit_id=cid))
133 await session.flush()
134 return cid
135
136
137 # ---------------------------------------------------------------------------
138 # Query-count tests — these fail until the N+1 is fixed
139 # ---------------------------------------------------------------------------
140
141
142 @pytest.mark.anyio
143 async def test_file_history_does_not_call_per_commit_manifest_fetch(
144 db_session: AsyncSession,
145 monkeypatch: pytest.MonkeyPatch,
146 ) -> None:
147 """get_snapshot_manifest must NOT be called per-commit after the fix.
148
149 The old code called it once per commit in the 300-row loop.
150 The new code must never call the single-snapshot variant inside the loop.
151 """
152 import musehub.api.routes.musehub.ui_blob as _module
153
154 calls: list[str] = []
155
156 async def _spy_single(session: AsyncSession, snapshot_id: str) -> JSONObject: # type: ignore[override]
157 calls.append(snapshot_id)
158 return {}
159
160 monkeypatch.setattr(_module, "get_snapshot_manifest", _spy_single)
161
162 repo_id = await _make_repo(db_session)
163 head_snap = await _add_snapshot(db_session, repo_id, {_FILE: _obj_id("v2")})
164 head_id = await _add_commit(db_session, repo_id, head_snap, ts_offset_seconds=10)
165 await db_session.commit()
166
167 async with _fresh_session() as read_session:
168 await _fetch_file_history(read_session, repo_id, _FILE, head_id)
169
170 assert calls == [], f"get_snapshot_manifest was called {len(calls)} time(s) — N+1 not fixed"
171
172
173 @pytest.mark.anyio
174 async def test_file_history_calls_batch_fetch_once(
175 db_session: AsyncSession,
176 monkeypatch: pytest.MonkeyPatch,
177 ) -> None:
178 """get_snapshot_manifests_batch must be called instead of per-commit fetches."""
179 import musehub.api.routes.musehub.ui_blob as _module
180 from musehub.services import musehub_snapshot as _snap_svc
181
182 batch_calls: list[list[str]] = []
183 _real_batch = _snap_svc.get_snapshot_manifests_batch
184
185 async def _spy_batch(session: AsyncSession, snapshot_ids: list[str]) -> JSONObject: # type: ignore[override]
186 batch_calls.append(list(snapshot_ids))
187 return await _real_batch(session, snapshot_ids)
188
189 monkeypatch.setattr(_module, "get_snapshot_manifests_batch", _spy_batch)
190
191 repo_id = await _make_repo(db_session)
192 head_snap_id = ""
193 head_commit_id = ""
194 for i in range(5):
195 snap_id = await _add_snapshot(db_session, repo_id, {_FILE: _obj_id(f"v{i}")})
196 cid = await _add_commit(db_session, repo_id, snap_id, ts_offset_seconds=i * 10)
197 if i == 4:
198 head_snap_id = snap_id
199 head_commit_id = cid
200
201 await db_session.commit()
202
203 async with _fresh_session() as read_session:
204 await _fetch_file_history(read_session, repo_id, _FILE, head_commit_id)
205
206 assert len(batch_calls) >= 1, "get_snapshot_manifests_batch was never called"
207 all_fetched = [sid for call in batch_calls for sid in call]
208 assert head_snap_id in all_fetched, "head snapshot_id must be included in batch fetch"
209
210
211 # ---------------------------------------------------------------------------
212 # Correctness tests
213 # ---------------------------------------------------------------------------
214
215
216 @pytest.mark.anyio
217 async def test_file_history_returns_only_commits_where_file_changed(
218 db_session: AsyncSession,
219 ) -> None:
220 """Only commits where the file's object_id changes between adjacent snapshots are returned."""
221 repo_id = await _make_repo(db_session)
222
223 # Commit 1: file = v1 (oldest)
224 s1 = await _add_snapshot(db_session, repo_id, {_FILE: _obj_id("v1")})
225 c1 = await _add_commit(db_session, repo_id, s1, ts_offset_seconds=0, message="init")
226
227 # Commit 2: file = v1 (unchanged — should NOT appear in history)
228 s2 = await _add_snapshot(db_session, repo_id, {_FILE: _obj_id("v1")})
229 c2 = await _add_commit(db_session, repo_id, s2, ts_offset_seconds=10, message="no-op")
230
231 # Commit 3: file = v2 (changed — should appear)
232 s3 = await _add_snapshot(db_session, repo_id, {_FILE: _obj_id("v2")})
233 c3 = await _add_commit(db_session, repo_id, s3, ts_offset_seconds=20, message="feat: v2")
234
235 # Commit 4: file = v3 (changed — HEAD, should appear)
236 s4 = await _add_snapshot(db_session, repo_id, {_FILE: _obj_id("v3")})
237 c4 = await _add_commit(db_session, repo_id, s4, ts_offset_seconds=30, message="feat: v3")
238
239 await db_session.commit()
240
241 async with _fresh_session() as read_session:
242 history = await _fetch_file_history(read_session, repo_id, _FILE, c4)
243
244 commit_ids = {h["commit_id_full"] for h in history}
245 assert c4 in commit_ids, "HEAD commit (v3) should be in history"
246 assert c3 in commit_ids, "commit that introduced v2 should be in history"
247 # Walking backward: c4(v3)→c3(v2)→c2(v1)→c1(v1)
248 # c2 appears because the file changed from v2→v1 between c3 and c2.
249 # c1 is skipped because c1 and c2 have the same oid — consecutive duplicates are collapsed.
250 assert c1 not in commit_ids, "c1 has same oid as c2 — consecutive duplicate, should be skipped"
251
252
253 @pytest.mark.anyio
254 async def test_file_history_returns_empty_when_file_not_in_head(
255 db_session: AsyncSession,
256 ) -> None:
257 """Returns [] when the file path does not exist in the head snapshot."""
258 repo_id = await _make_repo(db_session)
259 snap = await _add_snapshot(db_session, repo_id, {_OTHER_FILE: _obj_id("v1")})
260 head_id = await _add_commit(db_session, repo_id, snap)
261 await db_session.commit()
262
263 async with _fresh_session() as read_session:
264 history = await _fetch_file_history(read_session, repo_id, _FILE, head_id)
265 assert history == []
266
267
268 @pytest.mark.anyio
269 async def test_file_history_returns_empty_when_no_commits(
270 db_session: AsyncSession,
271 ) -> None:
272 """Returns [] when the head commit cannot be found in the DB."""
273 repo_id = await _make_repo(db_session)
274 await db_session.commit()
275
276 async with _fresh_session() as read_session:
277 history = await _fetch_file_history(read_session, repo_id, _FILE, _uid())
278 assert history == []
279
280
281 @pytest.mark.anyio
282 async def test_file_history_respects_limit(
283 db_session: AsyncSession,
284 ) -> None:
285 """History is capped at the requested limit even when more changes exist."""
286 repo_id = await _make_repo(db_session)
287
288 head_snap = None
289 head_cid = None
290 for i in range(25):
291 snap = await _add_snapshot(db_session, repo_id, {_FILE: _obj_id(f"v{i}")})
292 cid = await _add_commit(db_session, repo_id, snap, ts_offset_seconds=i * 10)
293 if i == 24:
294 head_snap = snap
295 head_cid = cid
296
297 await db_session.commit()
298
299 async with _fresh_session() as read_session:
300 history = await _fetch_file_history(read_session, repo_id, _FILE, head_cid, limit=5)
301 assert len(history) <= 5
302
303
304 @pytest.mark.anyio
305 async def test_file_history_unchanged_file_returns_one_entry(
306 db_session: AsyncSession,
307 ) -> None:
308 """A file that never changes shows only the initial commit."""
309 repo_id = await _make_repo(db_session)
310
311 head_cid = None
312 for i in range(4):
313 snap = await _add_snapshot(db_session, repo_id, {_FILE: _obj_id("v1")}) # always v1
314 cid = await _add_commit(db_session, repo_id, snap, ts_offset_seconds=i * 10)
315 if i == 3:
316 head_cid = cid
317
318 await db_session.commit()
319
320 async with _fresh_session() as read_session:
321 history = await _fetch_file_history(read_session, repo_id, _FILE, head_cid)
322 assert len(history) == 1