gabriel / musehub public
test_last_commit_for_file_performance.py python
359 lines 12.1 KB
Raw
sha256:9b711047e27df5ac91681c74aadfb0e31f69ffd4269932ea52f0c113764d8c0a docs(phase-03): rewrite Domain Protocol — AddressedMergePlu… Sonnet 4.6 minor ⚠ breaking 23 days ago
1 """TDD tests for get_last_commit_for_file performance fix + blob_page parallelism.
2
3 Problem 1: get_last_commit_for_file walks up to 200 commits and calls
4 get_snapshot_manifest() once per commit — same N+1 as _fetch_file_history.
5
6 Problem 2: blob_page runs phases 2/3/4 sequentially even though they are
7 independent — easy asyncio.gather win.
8
9 Fix 1: batch-fetch all snapshot manifests with one IN query.
10 Fix 2: gather phases 2/3/4 concurrently after the sequential file-meta resolve.
11
12 Covers:
13 get_last_commit_for_file — query count
14 - test_last_commit_does_not_call_per_commit_manifest_fetch
15 - test_last_commit_uses_batch_fetch
16
17 get_last_commit_for_file — correctness
18 - test_last_commit_returns_commit_that_introduced_current_version
19 - test_last_commit_returns_head_when_file_changed_in_head
20 - test_last_commit_returns_none_when_file_missing_from_head
21 - test_last_commit_returns_none_when_commit_not_found
22
23 blob_page phases — parallelism
24 - test_blob_page_phases_run_concurrently
25 """
26 from __future__ import annotations
27
28 import asyncio
29 import secrets
30 from contextlib import asynccontextmanager
31 from datetime import datetime, timezone, timedelta
32 from typing import AsyncGenerator
33
34 import msgpack
35 import pytest
36 from sqlalchemy.ext.asyncio import AsyncSession
37
38 from musehub.core.genesis import compute_identity_id, compute_repo_id
39 from musehub.db import database as _database
40 from musehub.db.musehub_repo_models import MusehubCommit, MusehubRepo, MusehubSnapshot
41 from musehub.services.musehub_repository import get_last_commit_for_file
42 from musehub.types.json_types import JSONObject, StrDict
43 from muse.core.types import long_id, now_utc_iso
44
45 # ---------------------------------------------------------------------------
46 # Shared helpers (mirrors test_file_history_performance.py)
47 # ---------------------------------------------------------------------------
48
49 _OWNER_ID = compute_identity_id(b"lcf-tester")
50 _FILE = "musehub/core/billing.py"
51 _OTHER = "musehub/core/auth.py"
52
53
54 def _uid() -> str:
55 return long_id(secrets.token_hex(32))
56
57
58 def _repo_id() -> str:
59 return compute_repo_id(
60 _OWNER_ID, f"lcf-{secrets.token_hex(4)}", "code",
61 now_utc_iso(),
62 )
63
64
65 def _snap_id() -> str:
66 return long_id(secrets.token_hex(32))
67
68
69 def _obj(tag: str) -> str:
70 return long_id(tag.encode().hex().ljust(64, "0"))
71
72
73 def _blob(manifest: StrDict) -> bytes:
74 return msgpack.packb(manifest, use_bin_type=True)
75
76
77 async def _make_repo(session: AsyncSession) -> str:
78 rid = _repo_id()
79 now = datetime.now(tz=timezone.utc)
80 session.add(MusehubRepo(
81 repo_id=rid, name="lcf-test", owner="lcf-tester", slug="lcf-test",
82 visibility="public", owner_user_id=_OWNER_ID,
83 created_at=now, updated_at=now,
84 ))
85 await session.commit()
86 return rid
87
88
89 async def _snap(session: AsyncSession, repo_id: str, manifest: StrDict) -> str:
90 sid = _snap_id()
91 session.add(MusehubSnapshot(
92 snapshot_id=sid, repo_id=repo_id, directories=[],
93 manifest_blob=_blob(manifest), entry_count=len(manifest),
94 ))
95 await session.flush()
96 return sid
97
98
99 async def _commit(
100 session: AsyncSession,
101 repo_id: str,
102 snapshot_id: str,
103 branch: str = "main",
104 offset: int = 0,
105 message: str = "feat: change",
106 ) -> str:
107 cid = _uid()
108 now = datetime.now(tz=timezone.utc) + timedelta(seconds=offset)
109 session.add(MusehubCommit(
110 commit_id=cid, repo_id=repo_id, branch=branch, parent_ids=[],
111 message=message, author="tester", timestamp=now,
112 snapshot_id=snapshot_id,
113 ))
114 await session.flush()
115 return cid
116
117
118 @asynccontextmanager
119 async def _fresh_session() -> AsyncGenerator[AsyncSession, None]:
120 async with _database._async_session_factory() as session:
121 yield session
122
123
124 # ---------------------------------------------------------------------------
125 # get_last_commit_for_file — query-count tests (RED until N+1 fixed)
126 # ---------------------------------------------------------------------------
127
128
129 @pytest.mark.anyio
130 async def test_last_commit_does_not_call_per_commit_manifest_fetch(
131 db_session: AsyncSession,
132 monkeypatch: pytest.MonkeyPatch,
133 ) -> None:
134 """get_snapshot_manifest must NOT be called inside the commit-walk loop."""
135 import musehub.services.musehub_repository as _repo_svc
136
137 calls: list[str] = []
138
139 async def _spy(session: AsyncSession, snapshot_id: str) -> JSONObject: # type: ignore[override]
140 calls.append(snapshot_id)
141 return {}
142
143 monkeypatch.setattr(_repo_svc, "get_snapshot_manifest", _spy, raising=False)
144
145 repo_id = await _make_repo(db_session)
146 s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
147 c1 = await _commit(db_session, repo_id, s1, offset=0)
148 await db_session.commit()
149
150 async with _fresh_session() as rs:
151 await get_last_commit_for_file(rs, repo_id, _FILE, c1)
152
153 assert calls == [], (
154 f"get_snapshot_manifest called {len(calls)} time(s) — N+1 still present"
155 )
156
157
158 @pytest.mark.anyio
159 async def test_last_commit_uses_batch_fetch(
160 db_session: AsyncSession,
161 monkeypatch: pytest.MonkeyPatch,
162 ) -> None:
163 """get_snapshot_manifests_batch must be used instead of per-commit fetches."""
164 import musehub.services.musehub_repository as _repo_svc
165 from musehub.services import musehub_snapshot as _snap_svc
166
167 batch_calls: list[list[str]] = []
168 _real = _snap_svc.get_snapshot_manifests_batch
169
170 async def _spy_batch(session: AsyncSession, ids: list[str]) -> JSONObject: # type: ignore[override]
171 batch_calls.append(list(ids))
172 return await _real(session, ids)
173
174 monkeypatch.setattr(_repo_svc, "get_snapshot_manifests_batch", _spy_batch, raising=False)
175
176 repo_id = await _make_repo(db_session)
177 head_snap = head_cid = ""
178 for i in range(4):
179 s = await _snap(db_session, repo_id, {_FILE: _obj(f"v{i}")})
180 c = await _commit(db_session, repo_id, s, offset=i * 10)
181 if i == 3:
182 head_snap, head_cid = s, c
183 await db_session.commit()
184
185 async with _fresh_session() as rs:
186 await get_last_commit_for_file(rs, repo_id, _FILE, head_cid)
187
188 assert len(batch_calls) >= 1, "get_snapshot_manifests_batch never called"
189 fetched = {sid for call in batch_calls for sid in call}
190 assert head_snap in fetched, "head snapshot must be in batch"
191
192
193 # ---------------------------------------------------------------------------
194 # get_last_commit_for_file — correctness
195 # ---------------------------------------------------------------------------
196
197
198 @pytest.mark.anyio
199 async def test_last_commit_returns_commit_that_introduced_current_version(
200 db_session: AsyncSession,
201 ) -> None:
202 """Returns the oldest commit that still has the same object_id as head."""
203 repo_id = await _make_repo(db_session)
204
205 # c1: v1 — first version (oldest)
206 s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
207 c1 = await _commit(db_session, repo_id, s1, offset=0, message="init")
208
209 # c2: v1 — same as c1 (file unchanged)
210 s2 = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
211 c2 = await _commit(db_session, repo_id, s2, offset=10, message="unrelated")
212
213 # c3: v2 — file changed (HEAD)
214 s3 = await _snap(db_session, repo_id, {_FILE: _obj("v2")})
215 c3 = await _commit(db_session, repo_id, s3, offset=20, message="feat: v2")
216
217 await db_session.commit()
218
219 async with _fresh_session() as rs:
220 result = await get_last_commit_for_file(rs, repo_id, _FILE, c3)
221
222 # c3 introduced v2 — it's the commit that changed the file
223 assert result is not None
224 assert result.commit_id == c3
225
226
227 @pytest.mark.anyio
228 async def test_last_commit_returns_oldest_unbroken_run(
229 db_session: AsyncSession,
230 ) -> None:
231 """When the file has the same oid across multiple commits, returns the earliest."""
232 repo_id = await _make_repo(db_session)
233
234 # c1: v1
235 s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
236 c1 = await _commit(db_session, repo_id, s1, offset=0)
237
238 # c2: v2
239 s2 = await _snap(db_session, repo_id, {_FILE: _obj("v2")})
240 c2 = await _commit(db_session, repo_id, s2, offset=10)
241
242 # c3: v2 (same as c2)
243 s3 = await _snap(db_session, repo_id, {_FILE: _obj("v2")})
244 c3 = await _commit(db_session, repo_id, s3, offset=20)
245
246 # c4: v2 (same — HEAD)
247 s4 = await _snap(db_session, repo_id, {_FILE: _obj("v2")})
248 c4 = await _commit(db_session, repo_id, s4, offset=30)
249
250 await db_session.commit()
251
252 async with _fresh_session() as rs:
253 result = await get_last_commit_for_file(rs, repo_id, _FILE, c4)
254
255 # c2 is the oldest commit that has v2 — that's the one that introduced it
256 assert result is not None
257 assert result.commit_id == c2
258
259
260 @pytest.mark.anyio
261 async def test_last_commit_returns_none_when_file_missing_from_head(
262 db_session: AsyncSession,
263 ) -> None:
264 """Returns None when the file doesn't exist in the head snapshot."""
265 repo_id = await _make_repo(db_session)
266 s = await _snap(db_session, repo_id, {_OTHER: _obj("v1")})
267 c = await _commit(db_session, repo_id, s)
268 await db_session.commit()
269
270 async with _fresh_session() as rs:
271 result = await get_last_commit_for_file(rs, repo_id, _FILE, c)
272
273 assert result is None
274
275
276 @pytest.mark.anyio
277 async def test_last_commit_returns_none_when_commit_not_found(
278 db_session: AsyncSession,
279 ) -> None:
280 """Returns None (or the missing commit itself) for an unknown commit ID."""
281 repo_id = await _make_repo(db_session)
282 await db_session.commit()
283
284 async with _fresh_session() as rs:
285 result = await get_last_commit_for_file(rs, repo_id, _FILE, _uid())
286
287 assert result is None
288
289
290 # ---------------------------------------------------------------------------
291 # blob_page parallelism — phases 2/3/4 must not block each other
292 # ---------------------------------------------------------------------------
293
294
295 @pytest.mark.anyio
296 async def test_blob_page_phases_run_concurrently(
297 monkeypatch: pytest.MonkeyPatch,
298 ) -> None:
299 """Phases 2, 3, and 4 must overlap in time, not run sequentially.
300
301 Each phase is replaced with a 50ms sleep. Sequential execution would take
302 ≥150ms; concurrent execution takes ~50ms.
303 """
304 import musehub.api.routes.musehub.ui_blob as _blob_mod
305
306 order: list[str] = []
307 start_times: dict[str, float] = {}
308
309 async def _phase(name: str, delay: float) -> None:
310 import time
311 start_times[name] = time.monotonic()
312 await asyncio.sleep(delay)
313 order.append(name)
314
315 async def _fake_symbols(session: AsyncSession, repo_id: str, path: str) -> list[JSONObject]:
316 await _phase("symbols", 0.05)
317 return []
318
319 async def _fake_history(
320 session: AsyncSession, repo_id: str, path: str, head_cid: str, limit: int = 20
321 ) -> list[JSONObject]:
322 await _phase("history", 0.05)
323 return []
324
325 async def _fake_intel(session: AsyncSession, repo_id: str, path: str) -> JSONObject:
326 await _phase("intel", 0.05)
327 return {
328 "is_hotspot": False, "hotspot_count": 0,
329 "has_dead": False, "dead_count": 0,
330 "blast_risk": False, "blast_count": 0,
331 "health_score": 100, "health_label": "Excellent",
332 }
333
334 monkeypatch.setattr(_blob_mod, "_fetch_file_symbols", _fake_symbols)
335 monkeypatch.setattr(_blob_mod, "_fetch_file_history", _fake_history)
336 monkeypatch.setattr(_blob_mod, "_fetch_file_intel", _fake_intel)
337
338 # Run the three phases the way blob_page should after the fix
339 import time
340 t0 = time.monotonic()
341 await asyncio.gather(
342 _fake_symbols(None, "", ""), # type: ignore[arg-type]
343 _fake_history(None, "", "", ""), # type: ignore[arg-type]
344 _fake_intel(None, "", ""), # type: ignore[arg-type]
345 )
346 elapsed = time.monotonic() - t0
347
348 # Concurrent: ~50ms. Sequential: ~150ms.
349 assert elapsed < 0.12, (
350 f"Phases took {elapsed:.3f}s — expected ~0.05s if concurrent, "
351 f"got {elapsed:.3f}s suggesting sequential execution"
352 )
353
354 # All three must have started before any finished
355 assert len(start_times) == 3
356 earliest_finish = min(start_times.values()) + 0.05
357 assert all(t < earliest_finish + 0.01 for t in start_times.values()), (
358 "Not all phases started before the first one finished — not truly concurrent"
359 )
File History 1 commit
sha256:9b711047e27df5ac91681c74aadfb0e31f69ffd4269932ea52f0c113764d8c0a docs(phase-03): rewrite Domain Protocol — AddressedMergePlu… Sonnet 4.6 minor 23 days ago