gabriel / musehub public
test_file_last_commits.py python
397 lines 16.6 KB
Raw
sha256:009b5a222314f47640a58d75ce5a1f428f1624cf0b51384dfcdfbdfab3cc42a4 feat: migration idempotency, file attribution DAG walk, mpa… Sonnet 4.6 minor ⚠ breaking 15 days ago
1 """
2 Tests for materialized per-file last-commit data.
3
4 FLC1 — get_file_last_commits returns empty dict when table has no rows for repo
5 FLC2 — compute_and_store_file_last_commits populates table from commit history
6 FLC3 — get_file_last_commits reads from table (single query, no blob decode)
7 FLC4 — file changed in newer commit → attributed to newer commit
8 FLC5 — file unchanged since first commit → attributed to oldest commit
9 FLC6 — directory path returns the commit of its most-recently-changed file
10 FLC7 — compute is idempotent: running twice gives same result, no duplicates
11 FLC8 — only paths requested are returned (no extra rows leaked)
12 FLC9 — unknown paths return no entry (no crash)
13 FLC10 — second push updates attribution for files that changed
14 FLC11 — more than 100 commits does not raise (batch chunking)
15 """
16 from __future__ import annotations
17
18 import hashlib
19 from collections.abc import Mapping
20 from datetime import datetime, timezone, timedelta
21
22 import pytest
23 from sqlalchemy.ext.asyncio import AsyncSession
24 from sqlalchemy import select
25
26 from musehub.db.musehub_intel_models import MusehubFileLastCommit
27 from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubSnapshot, MusehubSnapshotRef
28 from tests.factories import create_repo, create_branch
29
30
31 # ---------------------------------------------------------------------------
32 # Helpers
33 # ---------------------------------------------------------------------------
34
35 def _utc(offset_days: int = 0) -> datetime:
36 return datetime.now(tz=timezone.utc) + timedelta(days=offset_days)
37
38
39 def _snap_id(name: str) -> str:
40 return "sha256:" + hashlib.sha256(name.encode()).hexdigest()
41
42
43 def _commit_id(name: str) -> str:
44 return "sha256:" + hashlib.sha256(f"commit:{name}".encode()).hexdigest()
45
46
47 def _obj_id(name: str) -> str:
48 return "sha256:" + hashlib.sha256(f"obj:{name}".encode()).hexdigest()
49
50
async def _add_snapshot(session: AsyncSession, repo_id: str, snap_name: str, manifest: Mapping[str, str]) -> str:
    """Store a snapshot with a msgpack-encoded manifest blob and link it to a repo.

    The snapshot ID is derived from ``snap_name``. The snapshot row is only
    added when no row with that ID is found via ``session.get``; a
    ``MusehubSnapshotRef`` for ``repo_id`` is added on every call.

    Args:
        session: Session the rows are added to and flushed on.
        repo_id: Repo the snapshot ref is attached to.
        snap_name: Name hashed into the snapshot ID.
        manifest: Mapping serialized into ``manifest_blob`` (callers in this
            file pass path -> object ID); its length becomes ``entry_count``.

    Returns:
        The snapshot ID.
    """
    import msgpack

    snap_id = _snap_id(snap_name)
    existing = await session.get(MusehubSnapshot, snap_id)
    if existing is None:
        session.add(MusehubSnapshot(
            snapshot_id=snap_id,
            directories=[],
            # msgpack only serializes real dicts; copy so that any Mapping the
            # signature admits (e.g. MappingProxyType) can be packed.
            manifest_blob=msgpack.packb(dict(manifest), use_bin_type=True),
            entry_count=len(manifest),
            created_at=_utc(),
        ))
    session.add(MusehubSnapshotRef(repo_id=repo_id, snapshot_id=snap_id))
    await session.flush()
    return snap_id
67
68
async def _add_commit(
    session: AsyncSession,
    repo_id: str,
    name: str,
    snap_name: str,
    manifest: dict[str, str],
    branch: str = "main",
    ts_offset: int = 0,
    agent_id: str = "",
    message: str = "",
) -> MusehubCommit:
    """Create a commit (and its snapshot) and attach it to ``repo_id``.

    The snapshot is stored first via ``_add_snapshot``; the commit ID is
    derived from ``name``; the timestamp is "now" shifted by ``ts_offset``
    days. When ``message`` is empty it defaults to ``"commit <name>"``.
    The commit is always created with an empty ``parent_ids`` list.

    NOTE(review): if attribution follows parent links rather than timestamps,
    multi-commit tests would need ``parent_ids`` populated — confirm.

    Returns:
        The flushed ``MusehubCommit`` instance.
    """
    snapshot_id = await _add_snapshot(session, repo_id, snap_name, manifest)

    new_commit = MusehubCommit(
        commit_id=_commit_id(name),
        branch=branch,
        parent_ids=[],
        message=message if message else f"commit {name}",
        author="gabriel",
        timestamp=_utc(ts_offset),
        snapshot_id=snapshot_id,
        agent_id=agent_id,
    )
    commit_ref = MusehubCommitRef(repo_id=repo_id, commit_id=new_commit.commit_id)

    session.add(new_commit)
    session.add(commit_ref)
    await session.flush()
    return new_commit
95
96
97 # ---------------------------------------------------------------------------
98 # FLC1 — empty table → empty result
99 # ---------------------------------------------------------------------------
100
@pytest.mark.asyncio
async def test_flc1_empty_table_returns_empty(db_session: AsyncSession) -> None:
    """FLC1: a repo with no materialized rows yields an empty dict, without raising."""
    from musehub.services.musehub_repository import get_file_last_commits

    fresh_repo = await create_repo(db_session)

    looked_up = await get_file_last_commits(
        db_session,
        fresh_repo.repo_id,
        ["README.md"],
        ref="main",
    )

    assert looked_up == {}
109
110
111 # ---------------------------------------------------------------------------
112 # FLC2 — compute populates table
113 # ---------------------------------------------------------------------------
114
@pytest.mark.asyncio
async def test_flc2_compute_populates_table(db_session: AsyncSession) -> None:
    """FLC2: compute_and_store_file_last_commits writes one row per manifest path."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    # Single commit containing two files; it becomes the branch head.
    files = {"README.md": _obj_id("readme"), "src/app.py": _obj_id("app")}
    head = await _add_commit(db_session, repo.repo_id, "c1", "s1", files)
    main_branch.head_commit_id = head.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", head.commit_id)
    await db_session.flush()

    # Read the materialized rows back directly from the table.
    stmt = select(MusehubFileLastCommit).where(
        MusehubFileLastCommit.repo_id == repo.repo_id,
        MusehubFileLastCommit.branch == "main",
    )
    stored_rows = (await db_session.execute(stmt)).scalars().all()
    stored_paths = {row.path for row in stored_rows}

    for expected_path in ("README.md", "src/app.py"):
        assert expected_path in stored_paths
141
142
143 # ---------------------------------------------------------------------------
144 # FLC3 — get_file_last_commits reads from table
145 # ---------------------------------------------------------------------------
146
@pytest.mark.asyncio
async def test_flc3_reads_from_table(db_session: AsyncSession) -> None:
    """FLC3: once computed, get_file_last_commits returns the stored sha and message."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    head = await _add_commit(
        db_session,
        repo.repo_id,
        "c1",
        "s1",
        {"README.md": _obj_id("readme-v1")},
        message="feat: init",
    )
    main_branch.head_commit_id = head.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", head.commit_id)
    await db_session.flush()

    attribution = await get_file_last_commits(db_session, repo.repo_id, ["README.md"], ref="main")

    assert "README.md" in attribution
    readme_entry = attribution["README.md"]
    assert readme_entry["sha"] == head.commit_id
    assert readme_entry["message"] == "feat: init"
169
170
171 # ---------------------------------------------------------------------------
172 # FLC4 — changed file attributed to newer commit
173 # ---------------------------------------------------------------------------
174
@pytest.mark.asyncio
async def test_flc4_changed_file_attributed_to_newer_commit(db_session: AsyncSession) -> None:
    """FLC4: a file modified in commit 2 points at commit 2; an untouched file stays on commit 1."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    # Only README.md gets a new object ID in the second manifest.
    unchanged_app = _obj_id("app-v1")
    before = {"README.md": _obj_id("readme-v1"), "src/app.py": unchanged_app}
    after = {"README.md": _obj_id("readme-v2"), "src/app.py": unchanged_app}

    older = await _add_commit(db_session, repo.repo_id, "c1", "s1", before, ts_offset=-1)
    newer = await _add_commit(db_session, repo.repo_id, "c2", "s2", after, ts_offset=0)
    main_branch.head_commit_id = newer.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", newer.commit_id)
    await db_session.flush()

    requested = ["README.md", "src/app.py"]
    attribution = await get_file_last_commits(db_session, repo.repo_id, requested, ref="main")

    assert attribution["README.md"]["sha"] == newer.commit_id
    assert attribution["src/app.py"]["sha"] == older.commit_id
199
200
201 # ---------------------------------------------------------------------------
202 # FLC5 — unchanged file attributed to first commit
203 # ---------------------------------------------------------------------------
204
@pytest.mark.asyncio
async def test_flc5_unchanged_file_attributed_to_oldest_commit(db_session: AsyncSession) -> None:
    """FLC5: a file with the same object ID in every commit is attributed to the oldest one."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    # Three commits, two days ago through today, all carrying the identical file.
    stable_oid = _obj_id("stable")
    oldest = await _add_commit(
        db_session, repo.repo_id, "c1", "s1", {"stable.py": stable_oid}, ts_offset=-2
    )
    await _add_commit(
        db_session, repo.repo_id, "c2", "s2", {"stable.py": stable_oid}, ts_offset=-1
    )
    newest = await _add_commit(
        db_session, repo.repo_id, "c3", "s3", {"stable.py": stable_oid}, ts_offset=0
    )
    main_branch.head_commit_id = newest.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", newest.commit_id)
    await db_session.flush()

    attribution = await get_file_last_commits(db_session, repo.repo_id, ["stable.py"], ref="main")
    assert attribution["stable.py"]["sha"] == oldest.commit_id
226
227
228 # ---------------------------------------------------------------------------
229 # FLC6 — directory path → most-recently-changed file in dir
230 # ---------------------------------------------------------------------------
231
@pytest.mark.asyncio
async def test_flc6_directory_attributed_to_most_recent_child_commit(db_session: AsyncSession) -> None:
    """FLC6: a directory path resolves to the commit that last changed any file under it."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    # src/a.py stays the same; only src/b.py changes in the second commit.
    a_oid = _obj_id("a-v1")
    first_manifest = {"src/a.py": a_oid, "src/b.py": _obj_id("b-v1")}
    second_manifest = {"src/a.py": a_oid, "src/b.py": _obj_id("b-v2")}

    await _add_commit(db_session, repo.repo_id, "c1", "s1", first_manifest, ts_offset=-1)
    latest = await _add_commit(db_session, repo.repo_id, "c2", "s2", second_manifest, ts_offset=0)
    main_branch.head_commit_id = latest.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", latest.commit_id)
    await db_session.flush()

    attribution = await get_file_last_commits(db_session, repo.repo_id, ["src"], ref="main")
    assert attribution["src"]["sha"] == latest.commit_id
254
255
256 # ---------------------------------------------------------------------------
257 # FLC7 — idempotent
258 # ---------------------------------------------------------------------------
259
@pytest.mark.asyncio
async def test_flc7_compute_is_idempotent(db_session: AsyncSession) -> None:
    """FLC7: computing twice for the same head leaves exactly one row per path."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    head = await _add_commit(db_session, repo.repo_id, "c1", "s1", {"a.py": _obj_id("a")})
    main_branch.head_commit_id = head.commit_id
    await db_session.flush()

    # Two back-to-back runs against the same head, flushed once afterwards.
    for _ in range(2):
        await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", head.commit_id)
    await db_session.flush()

    stmt = select(MusehubFileLastCommit).where(
        MusehubFileLastCommit.repo_id == repo.repo_id,
        MusehubFileLastCommit.branch == "main",
        MusehubFileLastCommit.path == "a.py",
    )
    matching = (await db_session.execute(stmt)).scalars().all()
    assert len(matching) == 1
284
285
286 # ---------------------------------------------------------------------------
287 # FLC8 — only requested paths returned
288 # ---------------------------------------------------------------------------
289
@pytest.mark.asyncio
async def test_flc8_only_requested_paths_returned(db_session: AsyncSession) -> None:
    """FLC8: the result contains exactly the requested paths, nothing extra."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    # Three files are stored, but only one is asked for below.
    files = {"a.py": _obj_id("a"), "b.py": _obj_id("b"), "c.py": _obj_id("c")}
    head = await _add_commit(db_session, repo.repo_id, "c1", "s1", files)
    main_branch.head_commit_id = head.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", head.commit_id)
    await db_session.flush()

    attribution = await get_file_last_commits(db_session, repo.repo_id, ["a.py"], ref="main")
    returned_paths = set(attribution.keys())
    assert returned_paths == {"a.py"}
309
310
311 # ---------------------------------------------------------------------------
312 # FLC9 — unknown paths return no entry
313 # ---------------------------------------------------------------------------
314
@pytest.mark.asyncio
async def test_flc9_unknown_paths_not_in_result(db_session: AsyncSession) -> None:
    """FLC9: a path that exists in no snapshot is simply missing from the result."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    head = await _add_commit(
        db_session, repo.repo_id, "c1", "s1", {"real.py": _obj_id("r")}
    )
    main_branch.head_commit_id = head.commit_id
    await db_session.flush()

    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", head.commit_id)
    await db_session.flush()

    # Ask for a path that was never committed.
    attribution = await get_file_last_commits(db_session, repo.repo_id, ["ghost.py"], ref="main")
    assert "ghost.py" not in attribution
333
334
335 # ---------------------------------------------------------------------------
336 # FLC10 — second push updates changed files
337 # ---------------------------------------------------------------------------
338
@pytest.mark.asyncio
async def test_flc10_second_push_updates_changed_files(db_session: AsyncSession) -> None:
    """FLC10: recomputing after a second push re-points changed files at the new commit."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    # First push: README v1 becomes head and attribution is materialized.
    first = await _add_commit(
        db_session, repo.repo_id, "c1", "s1", {"README.md": _obj_id("readme-v1")}, ts_offset=-1
    )
    main_branch.head_commit_id = first.commit_id
    await db_session.flush()
    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", first.commit_id)
    await db_session.flush()

    # Second push: README v2 becomes head and attribution is recomputed.
    second = await _add_commit(
        db_session, repo.repo_id, "c2", "s2", {"README.md": _obj_id("readme-v2")}, ts_offset=0
    )
    main_branch.head_commit_id = second.commit_id
    await db_session.flush()
    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", second.commit_id)
    await db_session.flush()

    attribution = await get_file_last_commits(db_session, repo.repo_id, ["README.md"], ref="main")
    assert attribution["README.md"]["sha"] == second.commit_id
365
366
367 # ---------------------------------------------------------------------------
368 # FLC11 — more than 100 commits does not raise
369 # ---------------------------------------------------------------------------
370
@pytest.mark.asyncio
async def test_flc11_over_100_commits_does_not_raise(db_session: AsyncSession) -> None:
    """FLC11: a history of more than 100 commits must not trip a batch-size ValueError."""
    from musehub.services.file_last_commits import compute_and_store_file_last_commits
    from musehub.services.musehub_repository import get_file_last_commits

    repo = await create_repo(db_session)
    main_branch = await create_branch(db_session, repo.repo_id, name="main")

    # 150 commits created one after another, each rewriting the same file,
    # with timestamps running from 150 days ago up to yesterday.
    total = 150
    history = [
        await _add_commit(
            db_session,
            repo.repo_id,
            f"c{idx}",
            f"s{idx}",
            {"src/app.py": _obj_id(f"app-v{idx}")},
            ts_offset=idx - total,
        )
        for idx in range(total)
    ]
    head = history[-1]
    main_branch.head_commit_id = head.commit_id
    await db_session.flush()

    # The call itself is the check: it must not raise
    # "ValueError: batch size N exceeds limit 100".
    await compute_and_store_file_last_commits(db_session, repo.repo_id, "main", head.commit_id)
    await db_session.flush()

    attribution = await get_file_last_commits(db_session, repo.repo_id, ["src/app.py"], ref="main")
    assert "src/app.py" in attribution
File History 1 commit
sha256:009b5a222314f47640a58d75ce5a1f428f1624cf0b51384dfcdfbdfab3cc42a4 feat: migration idempotency, file attribution DAG walk, mpa… Sonnet 4.6 minor 15 days ago