gabriel / musehub public
test_last_commit_index_lookup.py python
294 lines 10.4 KB
Raw
sha256:5601f81903b6c70ddd11bd88a5a257ee6dfd38aa3b85b19746c100c030657f1e chore: update smoke_muse.sh comment to reference rc9 Sonnet 4.6 minor ⚠ breaking 21 days ago
1 """TDD tests for get_last_commit_for_file index-lookup optimization.
2
3 Problem: get_last_commit_for_file scans up to 200 snapshot manifests to find
4 which commit last touched a file — O(N) per page load.
5
6 Fix: query musehub_symbol_history_entries WHERE repo_id=? AND address=?
7 (or address LIKE 'path::%') ordered by committed_at DESC LIMIT 1. The index
8 ix_symbol_history_repo_address makes this O(1). Fall back to the snapshot
9 scan only when no history entries exist for the file.
10
11 Test matrix
12 -----------
13 test_get_last_commit_skips_snapshot_scan_when_history_exists
14 When symbol_history_entries has entries for the file, no snapshot manifests
15 must be fetched at all.
16
17 test_get_last_commit_returns_most_recent_history_entry
18 Returns the commit whose committed_at is latest among file-level entries.
19
20 test_get_last_commit_falls_back_to_snapshot_scan_when_no_history
21 When no history entries exist, falls back to snapshot scan (existing
22 behaviour preserved).
23
24 test_get_last_commit_matches_symbol_entries_for_file
25 Symbol-level addresses (path::Symbol) for the same file also count —
26 if the file path appears as a prefix, the most recent matching entry
27 is used.
28
29 test_get_last_commit_ignores_entries_for_other_files
30 Entries for a different file path don't pollute the result.
31 """
32 from __future__ import annotations
33
34 import secrets
35 from contextlib import asynccontextmanager
36 from datetime import datetime, timezone, timedelta
37 from typing import AsyncGenerator
38 from unittest.mock import AsyncMock, patch
39
40 import msgpack
41 import pytest
42 from sqlalchemy.ext.asyncio import AsyncSession
43
44 from musehub.core.genesis import compute_identity_id, compute_repo_id
45 from musehub.db import database as _database
46 from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry
47 from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubRepo, MusehubSnapshot, MusehubSnapshotRef
48 from musehub.services.musehub_repository import get_last_commit_for_file
49 from musehub.types.json_types import StrDict
50 from muse.core.types import long_id, now_utc_iso
51
# Return type of the spies that stand in for ``get_snapshot_manifests_batch``:
# a mapping from a string key to one manifest (``StrDict``).
# NOTE(review): keys are presumably snapshot ids — confirm against the real helper.
ManifestBatch = dict[str, StrDict]

# ── Constants ─────────────────────────────────────────────────────────────────

# Identity id used as ``owner_user_id`` on test repos and as the first
# argument to ``compute_repo_id``.
_OWNER_ID = compute_identity_id(b"lcf-index-tester")
# Path of the file whose last commit the tests look up.
_FILE = "musehub/core/billing.py"
# A second, different path used to check that entries for other files are ignored.
_OTHER = "musehub/core/auth.py"
59
60
61 # ── Helpers ───────────────────────────────────────────────────────────────────
62
def _uid() -> str:
    """Return a fresh random id: 64 random hex characters passed through ``long_id``."""
    random_hex = secrets.token_hex(32)
    return long_id(random_hex)
65
66
def _repo_id() -> str:
    """Compute a repo id for a randomly named test repository.

    The name handed to ``compute_repo_id`` carries a random 4-byte hex suffix,
    and the last argument is the current ``now_utc_iso()`` value, so repeated
    calls are fed different inputs.
    """
    random_name = f"lcf-idx-{secrets.token_hex(4)}"
    created_iso = now_utc_iso()
    return compute_repo_id(_OWNER_ID, random_name, "code", created_iso)
71
72
def _snap_id() -> str:
    """Return a fresh random snapshot id: 64 random hex characters passed through ``long_id``."""
    random_hex = secrets.token_hex(32)
    return long_id(random_hex)
75
76
def _obj(tag: str) -> str:
    """Build a deterministic object id from a short, human-readable *tag*.

    The tag's encoded bytes are hex-dumped, right-padded with ``"0"`` to at
    least 64 characters, and the result is passed through ``long_id``.
    """
    hex_tag = tag.encode().hex()
    padded_hex = hex_tag.ljust(64, "0")
    return long_id(padded_hex)
79
80
def _blob(manifest: StrDict) -> bytes:
    """Serialise *manifest* with msgpack (``use_bin_type=True``) and return the packed bytes."""
    packed = msgpack.packb(manifest, use_bin_type=True)
    return packed
83
84
async def _make_repo(session: AsyncSession) -> str:
    """Insert a public test repository, commit the session, and return its repo id.

    NOTE(review): the row's name/slug are fixed strings while the repo id is
    derived from a randomised name — confirm no uniqueness rule on
    owner + slug is relied upon when several repos share one database.
    """
    new_repo_id = _repo_id()
    created = datetime.now(tz=timezone.utc)
    repo_row = MusehubRepo(
        repo_id=new_repo_id,
        name="lcf-idx-test",
        owner="lcf-idx-tester",
        slug="lcf-idx-test",
        visibility="public",
        owner_user_id=_OWNER_ID,
        created_at=created,
        updated_at=created,
    )
    session.add(repo_row)
    await session.commit()
    return new_repo_id
95
96
async def _snap(session: AsyncSession, repo_id: str, manifest: StrDict) -> str:
    """Stage a snapshot holding *manifest* plus its repo link; return the snapshot id.

    The rows are flushed but not committed, so the caller decides when the
    transaction ends.
    """
    snapshot_id = _snap_id()
    snapshot_row = MusehubSnapshot(
        snapshot_id=snapshot_id,
        directories=[],
        manifest_blob=_blob(manifest),
        entry_count=len(manifest),
    )
    link_row = MusehubSnapshotRef(repo_id=repo_id, snapshot_id=snapshot_id)
    session.add_all([snapshot_row, link_row])
    await session.flush()
    return snapshot_id
106
107
async def _commit(
    session: AsyncSession,
    repo_id: str,
    snapshot_id: str,
    branch: str = "main",
    offset: int = 0,
    message: str = "feat: change",
) -> str:
    """Stage a parentless commit pointing at *snapshot_id*; return its commit id.

    The commit timestamp is the current UTC time shifted by *offset* seconds,
    which lets tests order commits without sleeping. Rows are flushed, not
    committed.
    """
    commit_id = _uid()
    stamped_at = datetime.now(tz=timezone.utc) + timedelta(seconds=offset)
    commit_row = MusehubCommit(
        commit_id=commit_id,
        branch=branch,
        parent_ids=[],
        message=message,
        author="tester",
        timestamp=stamped_at,
        snapshot_id=snapshot_id,
    )
    link_row = MusehubCommitRef(repo_id=repo_id, commit_id=commit_id)
    session.add_all([commit_row, link_row])
    await session.flush()
    return commit_id
126
127
async def _history_entry(
    session: AsyncSession,
    repo_id: str,
    commit_id: str,
    address: str,
    offset: int = 0,
    op: str = "modify",
    message: str = "feat: change",
) -> None:
    """Stage one ``MusehubSymbolHistoryEntry`` row for *address* and flush it.

    ``committed_at`` is the current UTC time shifted by *offset* seconds, so
    tests can control the relative ordering of entries.
    """
    committed_at = datetime.now(tz=timezone.utc) + timedelta(seconds=offset)
    entry_row = MusehubSymbolHistoryEntry(
        repo_id=repo_id,
        address=address,
        commit_id=commit_id,
        committed_at=committed_at,
        author="tester",
        op=op,
        message=message,
    )
    session.add(entry_row)
    await session.flush()
149
150
@asynccontextmanager
async def _fresh_session() -> AsyncGenerator[AsyncSession, None]:
    """Yield a new session from the database module's session factory.

    The tests use this to run the function under test on a session separate
    from the ``db_session`` fixture that seeded the data.
    """
    make_session = _database._async_session_factory
    async with make_session() as fresh:
        yield fresh
155
156
157 # ── Tests ─────────────────────────────────────────────────────────────────────
158
159
@pytest.mark.anyio
async def test_get_last_commit_skips_snapshot_scan_when_history_exists(
    db_session: AsyncSession,
) -> None:
    """No snapshot manifests fetched when symbol_history_entries has file entries.

    Seeds one commit and a file-level history entry for ``_FILE``, replaces
    ``get_snapshot_manifests_batch`` with a recording spy, and checks that the
    lookup never calls the spy and still resolves to the seeded commit.
    """
    repo_id = await _make_repo(db_session)
    s1 = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
    c1 = await _commit(db_session, repo_id, s1, offset=0)
    await _history_entry(db_session, repo_id, c1, _FILE, offset=0)
    await db_session.commit()

    batch_calls: list[list[str]] = []

    async def _spy_batch(session: AsyncSession, ids: list[str]) -> ManifestBatch:
        # Record the requested ids and return no manifests.
        batch_calls.append(ids)
        return {}

    with patch(
        "musehub.services.musehub_repository.get_snapshot_manifests_batch",
        side_effect=_spy_batch,
    ):
        async with _fresh_session() as rs:
            result = await get_last_commit_for_file(rs, repo_id, _FILE, c1)

    assert batch_calls == [], (
        f"get_snapshot_manifests_batch called {len(batch_calls)} time(s) "
        "even though symbol_history_entries has entries for the file"
    )
    # Guard against a vacuous pass: skipping the scan only counts if the
    # lookup also produced the commit recorded in the history entry.
    assert result is not None
    assert result.commit_id == c1
188
189
@pytest.mark.anyio
async def test_get_last_commit_returns_most_recent_history_entry(
    db_session: AsyncSession,
) -> None:
    """Returns the commit with the latest committed_at among file-level entries.

    Three commits are seeded 0, 10 and 20 seconds from now, each with a
    file-level history entry at the same offset; the lookup must return the
    last one.
    """
    repo_id = await _make_repo(db_session)

    # (offset in seconds, commit message) for versions v1, v2, v3 of the file.
    timeline = [(0, "init"), (10, "feat: v2"), (20, "feat: v3")]

    commit_ids: list[str] = []
    for version, (offset, message) in enumerate(timeline, start=1):
        snapshot_id = await _snap(db_session, repo_id, {_FILE: _obj(f"v{version}")})
        commit_ids.append(
            await _commit(db_session, repo_id, snapshot_id, offset=offset, message=message)
        )

    for commit_id, (offset, _message) in zip(commit_ids, timeline):
        await _history_entry(db_session, repo_id, commit_id, _FILE, offset=offset)
    await db_session.commit()

    c3 = commit_ids[-1]
    async with _fresh_session() as rs:
        result = await get_last_commit_for_file(rs, repo_id, _FILE, c3)

    assert result is not None
    assert result.commit_id == c3, (
        f"Expected most recent commit {c3}, got {result.commit_id}"
    )
218
219
@pytest.mark.anyio
async def test_get_last_commit_falls_back_to_snapshot_scan_when_no_history(
    db_session: AsyncSession,
) -> None:
    """Falls back to snapshot scan when no history entries exist for the file.

    Only a snapshot and a commit are seeded, with no history rows, so the
    commit can only be found through the pre-existing scan behaviour.
    """
    repo_id = await _make_repo(db_session)
    snapshot_id = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
    only_commit = await _commit(db_session, repo_id, snapshot_id, offset=0, message="init")
    # Deliberately no _history_entry call: the index lookup must come back empty.
    await db_session.commit()

    async with _fresh_session() as rs:
        result = await get_last_commit_for_file(rs, repo_id, _FILE, only_commit)

    assert result is not None
    assert result.commit_id == only_commit
236
237
@pytest.mark.anyio
async def test_get_last_commit_matches_symbol_entries_for_file(
    db_session: AsyncSession,
) -> None:
    """Symbol-level addresses (path::Symbol) for the file also trigger index path.

    The only history row uses the address ``<_FILE>::compute_total``; the
    lookup must still resolve to that row's commit without fetching any
    snapshot manifests.
    """
    repo_id = await _make_repo(db_session)
    first_snapshot = await _snap(db_session, repo_id, {_FILE: _obj("v1")})
    await _commit(db_session, repo_id, first_snapshot, offset=0)
    second_snapshot = await _snap(db_session, repo_id, {_FILE: _obj("v2")})
    newer_commit = await _commit(
        db_session, repo_id, second_snapshot, offset=10, message="feat: update fn",
    )

    # Symbol-level entry only; no entry is stored for the bare file path.
    symbol_address = f"{_FILE}::compute_total"
    await _history_entry(db_session, repo_id, newer_commit, symbol_address, offset=10)
    await db_session.commit()

    requested_batches: list[list[str]] = []

    async def _record_batch(session: AsyncSession, ids: list[str]) -> ManifestBatch:
        # Record the requested ids and return no manifests.
        requested_batches.append(ids)
        return {}

    scan_patch = patch(
        "musehub.services.musehub_repository.get_snapshot_manifests_batch",
        side_effect=_record_batch,
    )
    with scan_patch:
        async with _fresh_session() as rs:
            result = await get_last_commit_for_file(rs, repo_id, _FILE, newer_commit)

    assert requested_batches == [], "snapshot scan triggered despite symbol history entries"
    assert result is not None
    assert result.commit_id == newer_commit
269
270
@pytest.mark.anyio
async def test_get_last_commit_ignores_entries_for_other_files(
    db_session: AsyncSession,
) -> None:
    """History entries for a different file don't affect the result.

    Two commits are seeded; ``_FILE`` keeps the same object id in both
    snapshots while ``_OTHER`` changes in the second. The only history row
    belongs to ``_OTHER``, so the lookup for ``_FILE`` must not be answered
    from it.
    """
    repo_id = await _make_repo(db_session)

    first_manifest = {_FILE: _obj("v1"), _OTHER: _obj("o1")}
    first_snapshot = await _snap(db_session, repo_id, first_manifest)
    first_commit = await _commit(db_session, repo_id, first_snapshot, offset=0)

    second_manifest = {_FILE: _obj("v1"), _OTHER: _obj("o2")}
    second_snapshot = await _snap(db_session, repo_id, second_manifest)
    second_commit = await _commit(db_session, repo_id, second_snapshot, offset=10)

    # History exists for _OTHER only; nothing is recorded for _FILE.
    await _history_entry(db_session, repo_id, second_commit, _OTHER, offset=10)
    await db_session.commit()

    # With no entries for _FILE, the index path must not answer: fall back to the scan.
    async with _fresh_session() as rs:
        result = await get_last_commit_for_file(rs, repo_id, _FILE, second_commit)

    # _FILE has the same oid in both snapshots, so the first commit is the one that introduced it.
    assert result is not None
    assert result.commit_id == first_commit
File History 2 commits
sha256:5601f81903b6c70ddd11bd88a5a257ee6dfd38aa3b85b19746c100c030657f1e chore: update smoke_muse.sh comment to reference rc9 Sonnet 4.6 minor 21 days ago
sha256:39e9c4e6f2134da0732e6983268a218178973936f8d7ca03c91f2b5ad42133c8 fix: use read_object_bytes in blob viewer; add zstd magic d… Sonnet 4.6 patch 21 days ago