gabriel / musehub public
test_last_commit_for_file_performance.py python
361 lines 12.2 KB
Raw
sha256:0997d6250ae6476362f6fe2025af7789f46d03df3e9f34356d5e8ee79b201923 fix(issues): use issue number as pagination cursor, not cre… Sonnet 4.6 patch 8 days ago
1 """TDD tests for get_last_commit_for_file performance fix + blob_page parallelism.
2
3 Problem 1: get_last_commit_for_file walks up to 200 commits and calls
4 get_snapshot_manifest() once per commit — same N+1 as _fetch_file_history.
5
6 Problem 2: blob_page runs phases 2/3/4 sequentially even though they are
7 independent — easy asyncio.gather win.
8
9 Fix 1: batch-fetch all snapshot manifests with one IN query.
10 Fix 2: gather phases 2/3/4 concurrently after the sequential file-meta resolve.
11
12 Covers:
13 get_last_commit_for_file — query count
14 - test_last_commit_does_not_call_per_commit_manifest_fetch
15 - test_last_commit_uses_batch_fetch
16
17 get_last_commit_for_file — correctness
18 - test_last_commit_returns_commit_that_introduced_current_version
19 - test_last_commit_returns_head_when_file_changed_in_head
20 - test_last_commit_returns_none_when_file_missing_from_head
21 - test_last_commit_returns_none_when_commit_not_found
22
23 blob_page phases — parallelism
24 - test_blob_page_phases_run_concurrently
25 """
26 from __future__ import annotations
27
28 import asyncio
29 import secrets
30 from contextlib import asynccontextmanager
31 from datetime import datetime, timezone, timedelta
32 from typing import AsyncGenerator
33
34 import msgpack
35 import pytest
36 from sqlalchemy.ext.asyncio import AsyncSession
37
38 from musehub.core.genesis import compute_identity_id, compute_repo_id
39 from musehub.db import database as _database
40 from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubRepo, MusehubSnapshot, MusehubSnapshotRef
41 from musehub.services.musehub_repository import get_last_commit_for_file
42 from musehub.types.json_types import JSONObject, StrDict
43 from muse.core.types import long_id, now_utc_iso
44
45 # ---------------------------------------------------------------------------
46 # Shared helpers (mirrors test_file_history_performance.py)
47 # ---------------------------------------------------------------------------
48
49 _OWNER_ID = compute_identity_id(b"lcf-tester")
50 _FILE = "musehub/core/billing.py"
51 _OTHER = "musehub/core/auth.py"
52
53
54 def _uid() -> str:
55 return long_id(secrets.token_hex(32))
56
57
58 def _repo_id() -> str:
59 return compute_repo_id(
60 _OWNER_ID, f"lcf-{secrets.token_hex(4)}", "code",
61 now_utc_iso(),
62 )
63
64
65 def _snap_id() -> str:
66 return long_id(secrets.token_hex(32))
67
68
69 def _obj(tag: str) -> str:
70 return long_id(tag.encode().hex().ljust(64, "0"))
71
72
73 def _blob(manifest: StrDict) -> bytes:
74 return msgpack.packb(manifest, use_bin_type=True)
75
76
77 async def _make_repo(session: AsyncSession) -> str:
78 rid = _repo_id()
79 now = datetime.now(tz=timezone.utc)
80 session.add(MusehubRepo(
81 repo_id=rid, name="lcf-test", owner="lcf-tester", slug="lcf-test",
82 visibility="public", owner_user_id=_OWNER_ID,
83 created_at=now, updated_at=now,
84 ))
85 await session.commit()
86 return rid
87
88
89 async def _snap(session: AsyncSession, repo_id: str, manifest: StrDict) -> str:
90 sid = _snap_id()
91 session.add(MusehubSnapshot(
92 snapshot_id=sid, directories=[],
93 manifest_blob=_blob(manifest), entry_count=len(manifest),
94 ))
95 session.add(MusehubSnapshotRef(repo_id=repo_id, snapshot_id=sid))
96 await session.flush()
97 return sid
98
99
100 async def _commit(
101 session: AsyncSession,
102 repo_id: str,
103 snapshot_id: str,
104 branch: str = "main",
105 offset: int = 0,
106 message: str = "feat: change",
107 ) -> str:
108 cid = _uid()
109 now = datetime.now(tz=timezone.utc) + timedelta(seconds=offset)
110 session.add(MusehubCommit(
111 commit_id=cid, branch=branch, parent_ids=[],
112 message=message, author="tester", timestamp=now,
113 snapshot_id=snapshot_id,
114 ))
115 session.add(MusehubCommitRef(repo_id=repo_id, commit_id=cid))
116 await session.flush()
117 return cid
118
119
120 @asynccontextmanager
121 async def _fresh_session() -> AsyncGenerator[AsyncSession, None]:
122 async with _database._async_session_factory() as session:
123 yield session
124
125
126 # ---------------------------------------------------------------------------
127 # get_last_commit_for_file — query-count tests (RED until N+1 fixed)
128 # ---------------------------------------------------------------------------
129
130
131 @pytest.mark.anyio
132 async def test_last_commit_does_not_call_per_commit_manifest_fetch(
133 db_session: AsyncSession,
134 monkeypatch: pytest.MonkeyPatch,
135 ) -> None:
136 """get_snapshot_manifest must NOT be called inside the commit-walk loop."""
137 import musehub.services.musehub_repository as _repo_svc
138
139 calls: list[str] = []
140
141 async def _spy(session: AsyncSession, snapshot_id: str) -> JSONObject: # type: ignore[override]
142 calls.append(snapshot_id)
143 return {}
144
145 monkeypatch.setattr(_repo_svc, "get_snapshot_manifest", _spy, raising=False)
146
147 repo_id = await _make_repo(db_session)
148 s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
149 c1 = await _commit(db_session, repo_id, s1, offset=0)
150 await db_session.commit()
151
152 async with _fresh_session() as rs:
153 await get_last_commit_for_file(rs, repo_id, _FILE, c1)
154
155 assert calls == [], (
156 f"get_snapshot_manifest called {len(calls)} time(s) — N+1 still present"
157 )
158
159
160 @pytest.mark.anyio
161 async def test_last_commit_uses_batch_fetch(
162 db_session: AsyncSession,
163 monkeypatch: pytest.MonkeyPatch,
164 ) -> None:
165 """get_snapshot_manifests_batch must be used instead of per-commit fetches."""
166 import musehub.services.musehub_repository as _repo_svc
167 from musehub.services import musehub_snapshot as _snap_svc
168
169 batch_calls: list[list[str]] = []
170 _real = _snap_svc.get_snapshot_manifests_batch
171
172 async def _spy_batch(session: AsyncSession, ids: list[str]) -> JSONObject: # type: ignore[override]
173 batch_calls.append(list(ids))
174 return await _real(session, ids)
175
176 monkeypatch.setattr(_repo_svc, "get_snapshot_manifests_batch", _spy_batch, raising=False)
177
178 repo_id = await _make_repo(db_session)
179 head_snap = head_cid = ""
180 for i in range(4):
181 s = await _snap(db_session, repo_id, {_FILE: _obj(f"v{i}")})
182 c = await _commit(db_session, repo_id, s, offset=i * 10)
183 if i == 3:
184 head_snap, head_cid = s, c
185 await db_session.commit()
186
187 async with _fresh_session() as rs:
188 await get_last_commit_for_file(rs, repo_id, _FILE, head_cid)
189
190 assert len(batch_calls) >= 1, "get_snapshot_manifests_batch never called"
191 fetched = {sid for call in batch_calls for sid in call}
192 assert head_snap in fetched, "head snapshot must be in batch"
193
194
195 # ---------------------------------------------------------------------------
196 # get_last_commit_for_file — correctness
197 # ---------------------------------------------------------------------------
198
199
200 @pytest.mark.anyio
201 async def test_last_commit_returns_commit_that_introduced_current_version(
202 db_session: AsyncSession,
203 ) -> None:
204 """Returns the oldest commit that still has the same object_id as head."""
205 repo_id = await _make_repo(db_session)
206
207 # c1: v1 — first version (oldest)
208 s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
209 c1 = await _commit(db_session, repo_id, s1, offset=0, message="init")
210
211 # c2: v1 — same as c1 (file unchanged)
212 s2 = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
213 c2 = await _commit(db_session, repo_id, s2, offset=10, message="unrelated")
214
215 # c3: v2 — file changed (HEAD)
216 s3 = await _snap(db_session, repo_id, {_FILE: _obj("v2")})
217 c3 = await _commit(db_session, repo_id, s3, offset=20, message="feat: v2")
218
219 await db_session.commit()
220
221 async with _fresh_session() as rs:
222 result = await get_last_commit_for_file(rs, repo_id, _FILE, c3)
223
224 # c3 introduced v2 — it's the commit that changed the file
225 assert result is not None
226 assert result.commit_id == c3
227
228
229 @pytest.mark.anyio
230 async def test_last_commit_returns_oldest_unbroken_run(
231 db_session: AsyncSession,
232 ) -> None:
233 """When the file has the same oid across multiple commits, returns the earliest."""
234 repo_id = await _make_repo(db_session)
235
236 # c1: v1
237 s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
238 c1 = await _commit(db_session, repo_id, s1, offset=0)
239
240 # c2: v2
241 s2 = await _snap(db_session, repo_id, {_FILE: _obj("v2")})
242 c2 = await _commit(db_session, repo_id, s2, offset=10)
243
244 # c3: v2 (same as c2)
245 s3 = await _snap(db_session, repo_id, {_FILE: _obj("v2")})
246 c3 = await _commit(db_session, repo_id, s3, offset=20)
247
248 # c4: v2 (same — HEAD)
249 s4 = await _snap(db_session, repo_id, {_FILE: _obj("v2")})
250 c4 = await _commit(db_session, repo_id, s4, offset=30)
251
252 await db_session.commit()
253
254 async with _fresh_session() as rs:
255 result = await get_last_commit_for_file(rs, repo_id, _FILE, c4)
256
257 # c2 is the oldest commit that has v2 — that's the one that introduced it
258 assert result is not None
259 assert result.commit_id == c2
260
261
262 @pytest.mark.anyio
263 async def test_last_commit_returns_none_when_file_missing_from_head(
264 db_session: AsyncSession,
265 ) -> None:
266 """Returns None when the file doesn't exist in the head snapshot."""
267 repo_id = await _make_repo(db_session)
268 s = await _snap(db_session, repo_id, {_OTHER: _obj("v1")})
269 c = await _commit(db_session, repo_id, s)
270 await db_session.commit()
271
272 async with _fresh_session() as rs:
273 result = await get_last_commit_for_file(rs, repo_id, _FILE, c)
274
275 assert result is None
276
277
278 @pytest.mark.anyio
279 async def test_last_commit_returns_none_when_commit_not_found(
280 db_session: AsyncSession,
281 ) -> None:
282 """Returns None (or the missing commit itself) for an unknown commit ID."""
283 repo_id = await _make_repo(db_session)
284 await db_session.commit()
285
286 async with _fresh_session() as rs:
287 result = await get_last_commit_for_file(rs, repo_id, _FILE, _uid())
288
289 assert result is None
290
291
292 # ---------------------------------------------------------------------------
293 # blob_page parallelism — phases 2/3/4 must not block each other
294 # ---------------------------------------------------------------------------
295
296
297 @pytest.mark.anyio
298 async def test_blob_page_phases_run_concurrently(
299 monkeypatch: pytest.MonkeyPatch,
300 ) -> None:
301 """Phases 2, 3, and 4 must overlap in time, not run sequentially.
302
303 Each phase is replaced with a 50ms sleep. Sequential execution would take
304 ≥150ms; concurrent execution takes ~50ms.
305 """
306 import musehub.api.routes.musehub.ui_blob as _blob_mod
307
308 order: list[str] = []
309 start_times: dict[str, float] = {}
310
311 async def _phase(name: str, delay: float) -> None:
312 import time
313 start_times[name] = time.monotonic()
314 await asyncio.sleep(delay)
315 order.append(name)
316
317 async def _fake_symbols(session: AsyncSession, repo_id: str, path: str) -> list[JSONObject]:
318 await _phase("symbols", 0.05)
319 return []
320
321 async def _fake_history(
322 session: AsyncSession, repo_id: str, path: str, head_cid: str, limit: int = 20
323 ) -> list[JSONObject]:
324 await _phase("history", 0.05)
325 return []
326
327 async def _fake_intel(session: AsyncSession, repo_id: str, path: str) -> JSONObject:
328 await _phase("intel", 0.05)
329 return {
330 "is_hotspot": False, "hotspot_count": 0,
331 "has_dead": False, "dead_count": 0,
332 "blast_risk": False, "blast_count": 0,
333 "health_score": 100, "health_label": "Excellent",
334 }
335
336 monkeypatch.setattr(_blob_mod, "_fetch_file_symbols", _fake_symbols)
337 monkeypatch.setattr(_blob_mod, "_fetch_file_history", _fake_history)
338 monkeypatch.setattr(_blob_mod, "_fetch_file_intel", _fake_intel)
339
340 # Run the three phases the way blob_page should after the fix
341 import time
342 t0 = time.monotonic()
343 await asyncio.gather(
344 _fake_symbols(None, "", ""), # type: ignore[arg-type]
345 _fake_history(None, "", "", ""), # type: ignore[arg-type]
346 _fake_intel(None, "", ""), # type: ignore[arg-type]
347 )
348 elapsed = time.monotonic() - t0
349
350 # Concurrent: ~50ms. Sequential: ~150ms.
351 assert elapsed < 0.12, (
352 f"Phases took {elapsed:.3f}s — expected ~0.05s if concurrent, "
353 f"got {elapsed:.3f}s suggesting sequential execution"
354 )
355
356 # All three must have started before any finished
357 assert len(start_times) == 3
358 earliest_finish = min(start_times.values()) + 0.05
359 assert all(t < earliest_finish + 0.01 for t in start_times.values()), (
360 "Not all phases started before the first one finished — not truly concurrent"
361 )
File History 1 commit
sha256:0997d6250ae6476362f6fe2025af7789f46d03df3e9f34356d5e8ee79b201923 fix(issues): use issue number as pagination cursor, not cre… Sonnet 4.6 patch 8 days ago