gabriel / musehub public
test_entangle_provider.py python
1,199 lines 52.0 KB
Raw
sha256:ef10830ce231e0a20efcb0e2586cb879471247e916616e6fdd0d51df459e2595 fix: typing audit — 0 violations, 0 untyped defs across all… Sonnet 4.6 minor ⚠ breaking 20 days ago
1 """TDD spec for EntangleProvider — issue #13, Phase 5.
2
3 Verifies that EntangleProvider reproduces the same co-change analysis as
4 ``muse code entangle``: Jaccard-min rate, import filter, mass-commit exclusion,
5 canonical pair ordering, and repo isolation.
6
7 Eight test tiers (54 cases)
8 ---------------------------
9 Unit ET_01 – ET_08 rate formula, import filter, pair canonicalisation
10 Integration ET_09 – ET_18 provider upserts, re-runs, row counts
11 E2E ET_19 – ET_25 full seeded scenarios
12 Stress ET_26 – ET_30 500-symbol batch, mass-commit exclusion
13 State ET_31 – ET_36 idempotency, incremental updates, stale-row purge
14 Integrity ET_37 – ET_41 corrupt addresses, NULL exclusion, file-same filter
15 Performance ET_42 – ET_46 timing bounds on realistic datasets
16 Security ET_47 – ET_54 injection strings, repo isolation, address length cap
17 """
18 from __future__ import annotations
19
20 import secrets
21 import time
22 from collections import defaultdict
23 from itertools import combinations
24
25 import pytest
26 import pytest_asyncio
27 import sqlalchemy as sa
28 from sqlalchemy.dialects.postgresql import insert as pg_insert
29 from sqlalchemy.ext.asyncio import AsyncSession
30
31 from muse.core.types import fake_id, long_id
32 from musehub.db.musehub_intel_models import MusehubIntelEntangle, MusehubSymbolHistoryEntry
33 from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubRepo
34 from musehub.services.musehub_intel_providers import EntangleProvider
35 from musehub.types.json_types import JSONObject
36 from tests.factories import create_repo
37
38
39 # ─────────────────────────────────────────────────────────────────────────────
40 # Helpers
41 # ─────────────────────────────────────────────────────────────────────────────
42
def _uid() -> str:
    """Return ``fake_id`` applied to a fresh 32-character random hex token."""
    token = secrets.token_hex(16)
    return fake_id(token)
45
46
def _cid() -> str:
    """Return ``long_id`` applied to a fresh 64-character random hex token."""
    token = secrets.token_hex(32)
    return long_id(token)
49
50
# Owner and slug handed to ``create_repo`` when the fixtures build a repo.
_OWNER: str = "testuser"
_SLUG: str = "entangleprovider"
53
54
async def _seed_commit(
    session: AsyncSession,
    repo_id: str,
    commit_id: str,
    parent_ids: list[str] | None = None,
) -> None:
    """Insert one commit row plus the ref row tying it to *repo_id*.

    Both statements are built with ``on_conflict_do_nothing()``, so seeding
    the same commit or ref a second time does not raise on conflict.
    A missing or empty *parent_ids* is stored as an empty list.
    """
    from datetime import datetime, timezone

    commit_values = {
        "commit_id": commit_id,
        "message": "test commit",
        "author": "test",
        "branch": "dev",
        "parent_ids": parent_ids or [],
        "snapshot_id": None,
        "timestamp": datetime.now(tz=timezone.utc),
    }
    insert_commit = pg_insert(MusehubCommit).values(**commit_values)
    await session.execute(insert_commit.on_conflict_do_nothing())

    insert_ref = pg_insert(MusehubCommitRef).values(
        repo_id=repo_id,
        commit_id=commit_id,
    )
    await session.execute(insert_ref.on_conflict_do_nothing())
82
83
async def _seed_history(
    session: AsyncSession,
    repo_id: str,
    commit_id: str,
    addresses: list[str],
) -> None:
    """Insert one ``"update"`` history entry per address for *commit_id*.

    All entries written by a single call share the same ``committed_at``
    timestamp. Each insert uses ``on_conflict_do_nothing()``.
    """
    from datetime import datetime, timezone

    committed_at = datetime.now(tz=timezone.utc)
    for address in addresses:
        insert_entry = pg_insert(MusehubSymbolHistoryEntry).values(
            repo_id=repo_id,
            address=address,
            commit_id=commit_id,
            committed_at=committed_at,
            op="update",
        )
        await session.execute(insert_entry.on_conflict_do_nothing())
105
106
async def _run_provider(
    session: AsyncSession, repo_id: str, ref: str
) -> list[tuple[str, JSONObject]]:
    """Call ``compute`` on a new ``EntangleProvider`` and return its result.

    The final ``compute`` argument is always an empty dict.
    """
    provider = EntangleProvider()
    return await provider.compute(session, repo_id, ref, {})
111
112
async def _fetch_pairs(
    session: AsyncSession, repo_id: str
) -> list[MusehubIntelEntangle]:
    """Load every entangle row for *repo_id*, strongest coupling first.

    Rows are ordered by ``co_change_rate`` descending, then by
    ``co_changes`` descending.
    """
    query = (
        sa.select(MusehubIntelEntangle)
        .where(MusehubIntelEntangle.repo_id == repo_id)
        .order_by(
            sa.desc(MusehubIntelEntangle.co_change_rate),
            sa.desc(MusehubIntelEntangle.co_changes),
        )
    )
    rows = await session.execute(query)
    return list(rows.scalars().all())
125
126
127 # ─────────────────────────────────────────────────────────────────────────────
128 # Fixtures
129 # ─────────────────────────────────────────────────────────────────────────────
130
@pytest_asyncio.fixture
async def repo(db_session: AsyncSession) -> MusehubRepo:
    """Create and return one repo using the module-level owner and slug."""
    created = await create_repo(db_session, owner=_OWNER, slug=_SLUG)
    return created
134
135
@pytest_asyncio.fixture
async def two_repos(db_session: AsyncSession) -> tuple[MusehubRepo, MusehubRepo]:
    """Create two repos for the same owner (``et-repo-1`` then ``et-repo-2``)."""
    first = await create_repo(db_session, owner=_OWNER, slug="et-repo-1")
    second = await create_repo(db_session, owner=_OWNER, slug="et-repo-2")
    return first, second
141
142
143 # ─────────────────────────────────────────────────────────────────────────────
144 # Tier 1 — Unit: rate formula, import filter, pair canonicalisation
145 # ─────────────────────────────────────────────────────────────────────────────
146
class TestEntangleUnit:
    """Pure-function unit tests — no database required.

    The rate checks divide the number of shared commits by the size of the
    union of both symbols' commit sets. That is the denominator
    ``test_ET_18`` asserts against the real provider (4 and 2 commits with
    2 co-changes gives 0.5).
    """

    def test_ET_01_jaccard_min_rate_perfect(self) -> None:
        """100% rate: A and B change in exactly the same commits."""
        symbol_commits = {
            "src/billing.py::charge": {"c1", "c2", "c3"},
            "src/ledger.py::record": {"c1", "c2", "c3"},
        }
        a, b = "src/billing.py::charge", "src/ledger.py::record"
        # Derive the co-change count from the data instead of hard-coding it.
        co = len(symbol_commits[a] & symbol_commits[b])
        rate = co / len(symbol_commits[a] | symbol_commits[b])
        assert co == 3
        assert rate == 1.0

    def test_ET_02_jaccard_min_rate_partial(self) -> None:
        """Partial rate: B appears only in a subset of A's commits."""
        symbol_commits = {
            "src/a.py::fn1": {"c1", "c2", "c3", "c4", "c5"},
            "src/b.py::fn2": {"c1", "c2"},
        }
        a, b = "src/a.py::fn1", "src/b.py::fn2"
        co = len(symbol_commits[a] & symbol_commits[b])
        rate = co / len(symbol_commits[a] | symbol_commits[b])
        assert co == 2
        # 2 shared commits out of 5 in the union.
        assert rate == 0.4

    def test_ET_03_jaccard_min_rate_low(self) -> None:
        """Low coupling: only 1 of B's 10 commits overlaps with A."""
        symbol_commits = {
            "src/a.py::fn1": {"c1"},
            "src/b.py::fn2": {"c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "c10"},
        }
        a, b = "src/a.py::fn1", "src/b.py::fn2"
        co = len(symbol_commits[a] & symbol_commits[b])
        rate = co / len(symbol_commits[a] | symbol_commits[b])
        assert co == 1
        # 1 shared commit out of 10 in the union.
        assert rate == 0.1

    def test_ET_04_import_pseudo_symbol_excluded(self) -> None:
        """Addresses containing ::import:: must be filtered.

        This only pins the marker that identifies such an address; it does
        not call the provider.
        """
        addr = "src/billing.py::import::os"
        assert "::import::" in addr

    def test_ET_05_bare_path_excluded(self) -> None:
        """Addresses without '::' are bare file paths — not symbols."""
        addr = "cloudflare"
        assert "::" not in addr

    def test_ET_06_pair_key_canonical_ordering(self) -> None:
        """Pair key is always (a, b) where a < b lexicographically."""
        syms = ["src/z.py::zfn", "src/a.py::afn"]
        canonical = tuple(sorted(syms))
        assert canonical == ("src/a.py::afn", "src/z.py::zfn")

    def test_ET_07_same_file_pairs_excluded(self) -> None:
        """Pairs where file_a == file_b must be excluded.

        The file part of an address is everything before the first '::'.
        """
        a = "src/billing.py::charge"
        b = "src/billing.py::refund"
        assert a.split("::")[0] == b.split("::")[0]

    def test_ET_08_min_co_changes_threshold(self) -> None:
        """Pairs with co_changes < 2 are noise — must be excluded."""
        provider = EntangleProvider()
        assert provider._MIN_CO_CHANGES == 2
209
210
211 # ─────────────────────────────────────────────────────────────────────────────
212 # Tier 2 — Integration: provider upserts, reruns, row counts
213 # ─────────────────────────────────────────────────────────────────────────────
214
class TestEntangleIntegration:
    """Database-backed checks of what the provider returns and stores."""

    @pytest.mark.asyncio
    async def test_ET_09_empty_repo_returns_empty(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Provider on a repo with no commits returns empty results."""
        ref = _cid()
        result = await _run_provider(db_session, repo.repo_id, ref)
        assert result == []
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert pairs == []

    @pytest.mark.asyncio
    async def test_ET_10_no_history_entries_returns_empty(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Commits exist but no history entries → no pairs."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        await db_session.commit()
        # Run against the seeded repo at its head commit, so the provider
        # actually walks the commits created above.
        result = await _run_provider(db_session, repo.repo_id, c2)
        assert result == []

    @pytest.mark.asyncio
    async def test_ET_11_two_symbols_in_one_commit_no_pair(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Single co-change commit yields co_changes=1 — below MIN_CO_CHANGES=2."""
        c1 = _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_history(db_session, repo.repo_id, c1, [
            "src/a.py::fn_a", "src/b.py::fn_b",
        ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c1)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert pairs == []

    @pytest.mark.asyncio
    async def test_ET_12_two_co_changes_produces_one_pair(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Exactly 2 co-change commits → 1 pair at rate 1.0."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/billing.py::charge",
                "src/ledger.py::record",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1
        p = pairs[0]
        assert p.co_changes == 2
        assert p.co_change_rate == 1.0

    @pytest.mark.asyncio
    async def test_ET_13_import_symbols_excluded(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Import pseudo-symbols are not stored as entangle pairs."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::import::os",
                "src/b.py::import::sys",
                "src/a.py::real_fn",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Any stored row that mentions an import address is a filter failure.
        for p in pairs:
            assert "::import::" not in p.symbol_a
            assert "::import::" not in p.symbol_b

    @pytest.mark.asyncio
    async def test_ET_14_bare_path_addresses_excluded(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Bare path entries (no '::') are not treated as symbols."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "cloudflare",
                "src/a.py::real_fn",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        for p in pairs:
            assert "::" in p.symbol_a
            assert "::" in p.symbol_b

    @pytest.mark.asyncio
    async def test_ET_15_same_file_pair_excluded(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Two symbols from the same file must not produce a pair."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/billing.py::charge",
                "src/billing.py::refund",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert pairs == []

    @pytest.mark.asyncio
    async def test_ET_16_pair_stored_canonical_a_lt_b(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Stored pair always has symbol_a < symbol_b lexicographically."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            # Seeded in reverse lexicographic order on purpose.
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/z.py::zfn",
                "src/a.py::afn",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1
        # The two symbols differ, so the ordering must be strict; pin the
        # exact stored orientation rather than accepting equality.
        assert pairs[0].symbol_a < pairs[0].symbol_b
        assert (pairs[0].symbol_a, pairs[0].symbol_b) == (
            "src/a.py::afn",
            "src/z.py::zfn",
        )

    @pytest.mark.asyncio
    async def test_ET_17_file_a_b_populated(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """file_a and file_b columns derive from the symbol address."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/billing.py::charge",
                "src/ledger.py::record",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1
        p = pairs[0]
        assert p.file_a is not None and "/" in p.file_a
        assert p.file_b is not None and "/" in p.file_b
        assert p.file_a != p.file_b

    @pytest.mark.asyncio
    async def test_ET_18_commits_both_active_is_min(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """commits_both_active equals |commits_a ∪ commits_b| (Jaccard union).

        The method name still says ``is_min``, but the values asserted here
        are the union count and the rate computed over it.
        """
        # B appears in 2 commits; A in 4 commits; co_changes = 2
        # union = 4 + 2 - 2 = 4; rate = 2/4 = 0.5
        commits = [_cid() for _ in range(4)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        # A in all 4
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, ["src/a.py::fn_a"])
        # B only in first 2
        for cid in commits[:2]:
            await _seed_history(db_session, repo.repo_id, cid, ["src/b.py::fn_b"])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1
        assert pairs[0].commits_both_active == 4  # union: 4 + 2 - 2
        assert pairs[0].co_changes == 2
        assert pairs[0].co_change_rate == 0.5
401
402
403 # ─────────────────────────────────────────────────────────────────────────────
404 # Tier 3 — E2E: full seeded scenarios
405 # ─────────────────────────────────────────────────────────────────────────────
406
class TestEntangleE2E:
    """Full scenarios: seed commits and history, run the provider, read rows back."""

    @pytest.mark.asyncio
    async def test_ET_19_three_symbol_pair_ranking(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Three symbols; A+B co-change more often than A+C; top row is at 1.0."""
        chain = [_cid() for _ in range(5)]
        # Pair every commit with its predecessor; the first has no parent.
        for parent, commit_id in zip([None, *chain], chain):
            await _seed_commit(
                db_session, repo.repo_id, commit_id, [parent] if parent else []
            )
        # A and B change together in all five commits.
        for commit_id in chain:
            await _seed_history(
                db_session,
                repo.repo_id,
                commit_id,
                ["src/a.py::fn_a", "src/b.py::fn_b"],
            )
        # C joins them in the first two commits only.
        for commit_id in chain[:2]:
            await _seed_history(db_session, repo.repo_id, commit_id, ["src/c.py::fn_c"])
        await db_session.commit()

        head = chain[-1]
        await _run_provider(db_session, repo.repo_id, head)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert len(stored) == 3
        # Rows come back ordered by rate, so the always-together pair leads.
        strongest = stored[0]
        assert strongest.co_change_rate == 1.0

    @pytest.mark.asyncio
    async def test_ET_20_a_in_test_flag_set_for_test_files(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Exactly one of a_in_test / b_in_test is set for a test-file + source-file pair."""
        parent, head = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, parent)
        await _seed_commit(db_session, repo.repo_id, head, [parent])
        touched = ["tests/test_billing.py::test_charge", "src/ledger.py::record"]
        for commit_id in (parent, head):
            await _seed_history(db_session, repo.repo_id, commit_id, touched)
        await db_session.commit()

        await _run_provider(db_session, repo.repo_id, head)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert len(stored) == 1
        only = stored[0]
        # Which side is the test file depends on pair ordering, so compare flags.
        assert only.a_in_test != only.b_in_test

    @pytest.mark.asyncio
    async def test_ET_21_result_metadata_keys(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Provider returns one (key, payload) tuple carrying the expected metadata keys."""
        parent, head = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, parent)
        await _seed_commit(db_session, repo.repo_id, head, [parent])
        touched = ["src/a.py::fn_a", "src/b.py::fn_b"]
        for commit_id in (parent, head):
            await _seed_history(db_session, repo.repo_id, commit_id, touched)
        await db_session.commit()

        outcome = await _run_provider(db_session, repo.repo_id, head)
        assert len(outcome) == 1
        key, payload = outcome[0]
        assert key == "intel.code.entangle"
        assert "count" in payload
        assert "commits_analysed" in payload
        assert "truncated" in payload

    @pytest.mark.asyncio
    async def test_ET_22_ref_stored_on_pair_row(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """The ref passed to the provider is recorded on the stored pair row."""
        parent, head = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, parent)
        await _seed_commit(db_session, repo.repo_id, head, [parent])
        touched = ["src/a.py::fn_a", "src/b.py::fn_b"]
        for commit_id in (parent, head):
            await _seed_history(db_session, repo.repo_id, commit_id, touched)
        await db_session.commit()

        await _run_provider(db_session, repo.repo_id, head)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert len(stored) == 1
        assert stored[0].ref == head

    @pytest.mark.asyncio
    async def test_ET_23_multiple_disconnected_pairs(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Four symbols in four files across three commits yield at least two pairs."""
        root, middle, head = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, root)
        await _seed_commit(db_session, repo.repo_id, middle, [root])
        await _seed_commit(db_session, repo.repo_id, head, [middle])
        touched = [
            "src/alpha.py::a1", "src/beta.py::b1",  # pair 1
            "src/gamma.py::c1", "src/delta.py::d1",  # pair 2
        ]
        for commit_id in (root, middle, head):
            await _seed_history(db_session, repo.repo_id, commit_id, touched)
        await db_session.commit()

        await _run_provider(db_session, repo.repo_id, head)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        # Lower bound only: at least the two intended cross-file pairs.
        assert len(stored) >= 2

    @pytest.mark.asyncio
    async def test_ET_24_structurally_linked_defaults_false(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Every stored row has structurally_linked set to False."""
        parent, head = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, parent)
        await _seed_commit(db_session, repo.repo_id, head, [parent])
        touched = ["src/a.py::fn_a", "src/b.py::fn_b"]
        for commit_id in (parent, head):
            await _seed_history(db_session, repo.repo_id, commit_id, touched)
        await db_session.commit()

        await _run_provider(db_session, repo.repo_id, head)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        for row in stored:
            assert row.structurally_linked is False

    @pytest.mark.asyncio
    async def test_ET_25_same_file_false_on_stored_pair(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Every stored row has same_file set to False."""
        parent, head = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, parent)
        await _seed_commit(db_session, repo.repo_id, head, [parent])
        touched = ["src/a.py::fn_a", "src/b.py::fn_b"]
        for commit_id in (parent, head):
            await _seed_history(db_session, repo.repo_id, commit_id, touched)
        await db_session.commit()

        await _run_provider(db_session, repo.repo_id, head)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        for row in stored:
            assert row.same_file is False
548
549
550 # ─────────────────────────────────────────────────────────────────────────────
551 # Tier 4 — Stress: large datasets
552 # ─────────────────────────────────────────────────────────────────────────────
553
class TestEntangleStress:
    """Larger seeded datasets: pair cap, mass commits, and the walk cap."""

    @pytest.mark.asyncio
    async def test_ET_26_max_pairs_cap_respected(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Provider stores at most MAX_PAIRS pairs even when more exist."""
        provider = EntangleProvider()
        # Build enough distinct cross-file pairs by spreading symbols
        # across 35 files × 2 symbols = 70 symbols → 70*69/2 = 2415 pairs before filter
        commits = [_cid() for _ in range(3)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        addrs = [f"src/file_{i}.py::fn_{j}" for i in range(35) for j in range(2)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) <= provider._MAX_PAIRS

    @pytest.mark.asyncio
    async def test_ET_27_mass_commit_excluded(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Commits touching > MAX_SYMBOLS_PER_COMMIT symbols are skipped."""
        provider = EntangleProvider()
        # Seed two legit commits and one mass commit
        c_legit1, c_legit2, c_mass = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c_legit1)
        await _seed_commit(db_session, repo.repo_id, c_legit2, [c_legit1])
        await _seed_commit(db_session, repo.repo_id, c_mass, [c_legit2])
        # Legit commits: A and B co-change
        for cid in [c_legit1, c_legit2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        # Mass commit: 100 symbols over the per-commit limit
        big_addrs = [f"src/gen_{i}.py::fn" for i in range(provider._MAX_SYMBOLS_PER_COMMIT + 100)]
        await _seed_history(db_session, repo.repo_id, c_mass, big_addrs)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c_mass)
        # Provider should still return the AB pair from legit commits
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert any(
            ("src/a.py::fn_a" in (p.symbol_a, p.symbol_b))
            for p in pairs
        )
        # None of the mass-commit symbols may surface in a stored pair.
        assert not any(
            sym.startswith("src/gen_")
            for p in pairs
            for sym in (p.symbol_a, p.symbol_b)
        )

    @pytest.mark.asyncio
    async def test_ET_28_500_symbols_completes(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """500 symbols across 10 commits completes without error."""
        commits = [_cid() for _ in range(10)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        # 250 files × 2 symbols = 500 symbols (all under mass-commit limit)
        addrs = [f"src/f{i}.py::fn_{j}" for i in range(250) for j in range(2)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        result = await _run_provider(db_session, repo.repo_id, commits[-1])
        assert result  # no exception

    @pytest.mark.asyncio
    async def test_ET_29_result_count_matches_stored_rows(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """metadata 'count' matches the actual number of rows stored."""
        c1, c2, c3 = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        for cid in [c1, c2, c3]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a",
                "src/b.py::fn_b",
                "src/c.py::fn_c",
            ])
        await db_session.commit()
        result = await _run_provider(db_session, repo.repo_id, c3)
        key, payload = result[0]
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert payload["count"] == len(pairs)

    @pytest.mark.asyncio
    async def test_ET_30_bfs_walk_cap_limits_commits_analysed(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """commits_analysed never exceeds MAX_WALK."""
        provider = EntangleProvider()
        cap = provider._MAX_WALK
        commits = []
        prev = None
        # The chain is limited to 50 commits to keep the test fast, so it is
        # only longer than the cap when _MAX_WALK is below 50.
        for _ in range(min(cap + 5, 50)):  # keep it fast; just verify cap exists
            cid = _cid()
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            commits.append(cid)
            prev = cid
        await _seed_history(db_session, repo.repo_id, commits[0], [
            "src/a.py::fn_a", "src/b.py::fn_b",
        ])
        await db_session.commit()
        result = await _run_provider(db_session, repo.repo_id, commits[-1])
        # NOTE(review): only one commit carries history, which is a single
        # co-change; if the provider returns an empty list in that case the
        # assertion below never runs — confirm against the provider.
        if result:
            key, payload = result[0]
            assert payload["commits_analysed"] <= cap
666
667
668 # ─────────────────────────────────────────────────────────────────────────────
669 # Tier 5 — State: idempotency, incremental updates, stale-row purge
670 # ─────────────────────────────────────────────────────────────────────────────
671
class TestEntangleState:
    """Re-run behaviour: idempotency, incremental updates, and truncation."""

    @pytest.mark.asyncio
    async def test_ET_31_idempotent_rerun_same_rows(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Running the provider twice produces the same set of rows."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        first_run = await _fetch_pairs(db_session, repo.repo_id)
        await _run_provider(db_session, repo.repo_id, c2)
        second_run = await _fetch_pairs(db_session, repo.repo_id)
        assert len(first_run) == len(second_run)
        assert {(p.symbol_a, p.symbol_b) for p in first_run} == {
            (p.symbol_a, p.symbol_b) for p in second_run
        }

    @pytest.mark.asyncio
    async def test_ET_32_stale_rows_purged_on_rerun(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Re-run after an unrelated commit still leaves exactly the one valid pair."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        first_count_result = await db_session.execute(
            sa.select(sa.func.count()).select_from(MusehubIntelEntangle)
            .where(MusehubIntelEntangle.repo_id == repo.repo_id)
        )
        assert first_count_result.scalar_one() == 1

        # Add a new commit that touches a completely different symbol
        c3 = _cid()
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        await _seed_history(db_session, repo.repo_id, c3, [
            "src/x.py::fn_x",  # completely different
        ])
        # Re-run; AB pair should still exist (still valid from c1, c2)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c3)
        second_run = await _fetch_pairs(db_session, repo.repo_id)
        assert len(second_run) == 1  # AB still valid

    @pytest.mark.asyncio
    async def test_ET_33_incremental_new_pair_appears(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """After adding commits, a new pair materialises on re-run."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        before = await _fetch_pairs(db_session, repo.repo_id)

        # Two new commits introducing a CD pair
        c3, c4 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        await _seed_commit(db_session, repo.repo_id, c4, [c3])
        for cid in [c3, c4]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/c.py::fn_c", "src/d.py::fn_d",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c4)
        after = await _fetch_pairs(db_session, repo.repo_id)
        assert len(after) > len(before)

    @pytest.mark.asyncio
    async def test_ET_34_no_duplicate_pairs(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """No (symbol_a, symbol_b) duplicate rows for the same repo."""
        c1, c2, c3 = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        for cid in [c1, c2, c3]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        # Three runs against the same ref must not multiply rows.
        for _ in range(3):
            await _run_provider(db_session, repo.repo_id, c3)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        keys = [(p.symbol_a, p.symbol_b) for p in pairs]
        assert len(keys) == len(set(keys))

    @pytest.mark.asyncio
    async def test_ET_35_rate_updates_on_new_commits(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Rate increases when more co-change commits are added."""
        # Initial: A in 3 commits, B in 2 commits, co=2 → union=3, rate=2/3
        commits = [_cid() for _ in range(3)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        # A appears in all 3
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, ["src/a.py::fn_a"])
        # B co-changes only in first 2
        for cid in commits[:2]:
            await _seed_history(db_session, repo.repo_id, cid, ["src/b.py::fn_b"])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        pairs_before = await _fetch_pairs(db_session, repo.repo_id)
        # Require the pair to exist; a 0.0 fallback would let the test pass
        # even when nothing was stored.
        assert len(pairs_before) == 1
        rate_before = pairs_before[0].co_change_rate

        # Now add a commit where both co-change again:
        # A in 4, B in 3, co=3 → union=4, rate=3/4
        c_new = _cid()
        await _seed_commit(db_session, repo.repo_id, c_new, [commits[-1]])
        await _seed_history(db_session, repo.repo_id, c_new, [
            "src/a.py::fn_a", "src/b.py::fn_b",
        ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c_new)
        pairs_after = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs_after) == 1
        # Strict comparison: a row left untouched by the re-run must fail.
        assert pairs_after[0].co_change_rate > rate_before

    @pytest.mark.asyncio
    async def test_ET_36_truncated_flag_true_when_over_cap(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """truncated=True when more pairs were found than MAX_PAIRS."""
        commits = [_cid() for _ in range(3)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        # 35 files × 2 syms = 70 symbols → 2415 candidate pairs, meant to exceed the cap
        addrs = [f"src/file_{i}.py::fn_{j}" for i in range(35) for j in range(2)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        result = await _run_provider(db_session, repo.repo_id, commits[-1])
        key, payload = result[0]
        assert payload["truncated"] is True
830
831
832 # ─────────────────────────────────────────────────────────────────────────────
833 # Tier 6 — Integrity: edge cases and data quality
834 # ─────────────────────────────────────────────────────────────────────────────
835
class TestEntangleIntegrity:
    """Tier 6 — Integrity: edge cases and data quality (ET_37 – ET_41)."""

    @pytest.mark.asyncio
    async def test_ET_37_address_with_only_import_produces_no_pair(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """A commit with only import pseudo-symbols generates no pair rows."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::import::os",
                "src/b.py::import::sys",
                "src/c.py::import::typing",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert pairs == []

    @pytest.mark.asyncio
    async def test_ET_38_mixed_valid_and_import_symbols(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Import symbols in same commit as real symbols don't pair with real ones."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::real_fn",
                "src/b.py::import::os",  # filtered
                "src/c.py::other_fn",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Two real cross-file symbols co-change in both commits, so at least
        # one row is expected; without this guard the loop below would pass
        # vacuously if the provider wrote nothing at all.
        assert pairs, "expected a pair row for the two real symbols"
        for p in pairs:
            assert "::import::" not in p.symbol_a
            assert "::import::" not in p.symbol_b

    @pytest.mark.asyncio
    async def test_ET_39_unknown_ref_in_bfs_returns_empty(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """BFS from unknown ref produces no pairs (ref not in commit table)."""
        unknown_ref = _cid()  # never seeded, so the commit table has no such row
        result = await _run_provider(db_session, repo.repo_id, unknown_ref)
        assert result == []

    @pytest.mark.asyncio
    async def test_ET_40_co_changes_count_exact(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """co_changes is the exact number of commits where both symbols appeared."""
        n_together = 4
        n_solo_a = 2
        commits_together = [_cid() for _ in range(n_together)]
        commits_a_only = [_cid() for _ in range(n_solo_a)]
        all_commits = commits_together + commits_a_only
        prev = None
        for cid in all_commits:
            # Linear history: each commit's only parent is the previous one.
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        for cid in commits_together:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        for cid in commits_a_only:
            await _seed_history(db_session, repo.repo_id, cid, ["src/a.py::fn_a"])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, all_commits[-1])
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1
        # union = count_a + count_b - co_changes
        #       = (n_together + n_solo_a) + n_together - n_together
        union = n_together + n_solo_a  # = 6
        assert pairs[0].co_changes == n_together
        assert pairs[0].commits_both_active == union
        assert pairs[0].co_change_rate == round(n_together / union, 6)

    @pytest.mark.asyncio
    async def test_ET_41_rate_capped_at_one(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """co_change_rate is never > 1.0."""
        commits = [_cid() for _ in range(5)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Guard against a vacuous pass: the two symbols co-change in every
        # commit, so a row must exist for the bound check to mean anything.
        assert pairs, "expected a pair row for symbols that always co-change"
        for p in pairs:
            assert 0.0 <= p.co_change_rate <= 1.0
936
937
938 # ─────────────────────────────────────────────────────────────────────────────
939 # Tier 7 — Performance: timing bounds
940 # ─────────────────────────────────────────────────────────────────────────────
941
class TestEntanglePerformance:
    """Tier 7 — Performance: timing bounds (ET_42 – ET_46).

    Durations are measured with ``time.perf_counter()``, the clock intended
    for timing short intervals; several bounds here are in the tens of
    milliseconds, where a coarse clock would make the assertions meaningless.
    """

    @pytest.mark.asyncio
    async def test_ET_42_ten_commits_ten_symbols_under_500ms(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """10 commits × 10 symbols completes in under 500 ms."""
        commits = [_cid() for _ in range(10)]
        prev = None
        for cid in commits:
            # Linear history: each commit's only parent is the previous one.
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        addrs = [f"src/file_{i}.py::fn" for i in range(10)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        # Only the provider run is timed; seeding above is excluded.
        t0 = time.perf_counter()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        elapsed = time.perf_counter() - t0
        assert elapsed < 0.5, f"took {elapsed:.3f}s"

    @pytest.mark.asyncio
    async def test_ET_43_100_commits_20_symbols_under_2s(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """100 commits × 20 symbols completes in under 2 s."""
        commits = [_cid() for _ in range(100)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        addrs = [f"src/f{i}.py::fn" for i in range(20)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        t0 = time.perf_counter()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        elapsed = time.perf_counter() - t0
        assert elapsed < 2.0, f"took {elapsed:.3f}s"

    @pytest.mark.asyncio
    async def test_ET_44_empty_repo_under_50ms(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Empty repo fast-path exits under 50 ms."""
        t0 = time.perf_counter()
        # Nothing is seeded; the ref passed in is a fresh, unknown commit id.
        await _run_provider(db_session, repo.repo_id, _cid())
        elapsed = time.perf_counter() - t0
        assert elapsed < 0.05, f"took {elapsed:.3f}s"

    @pytest.mark.asyncio
    async def test_ET_45_rerun_same_speed_as_first(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Second run is not significantly slower than first run."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        t1 = time.perf_counter()
        await _run_provider(db_session, repo.repo_id, c2)
        d1 = time.perf_counter() - t1
        t2 = time.perf_counter()
        await _run_provider(db_session, repo.repo_id, c2)
        d2 = time.perf_counter() - t2
        # second run should not be more than 5× slower; the 0.5 s floor keeps
        # the bound from collapsing when the first run is extremely fast
        assert d2 < max(d1 * 5, 0.5)

    @pytest.mark.asyncio
    async def test_ET_46_point_lookup_fast(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Fetching pairs for a specific repo is sub-10 ms after provider run."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        # Only the read is timed, not the provider run above.
        t0 = time.perf_counter()
        await _fetch_pairs(db_session, repo.repo_id)
        elapsed = time.perf_counter() - t0
        assert elapsed < 0.01, f"took {elapsed:.3f}s"
1032
1033
1034 # ─────────────────────────────────────────────────────────────────────────────
1035 # Tier 8 — Security: injection, isolation, address length
1036 # ─────────────────────────────────────────────────────────────────────────────
1037
class TestEntangleSecurity:
    """Tier 8 — Security: injection, isolation, address length (ET_47 – ET_54)."""

    @pytest.mark.asyncio
    async def test_ET_47_sql_injection_in_address_stored_verbatim(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """SQL injection strings in symbol addresses are stored as-is (no execution)."""
        inject = "src/a.py::fn'; DROP TABLE musehub_intel_entangle; --"
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                inject,
                "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        # Table must still exist: the fetch below would fail if it had been dropped.
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Smoke check only: the run and the fetch completed without error.
        assert isinstance(pairs, list)

    @pytest.mark.asyncio
    async def test_ET_48_xss_payload_in_address_stored_safely(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """XSS payloads in addresses are stored without execution."""
        xss = "src/<script>alert(1)</script>.py::fn"
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                xss,
                "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Smoke check only: the run and the fetch completed without error.
        assert isinstance(pairs, list)

    @pytest.mark.asyncio
    async def test_ET_49_repo_isolation_strict(
        self, db_session: AsyncSession, two_repos: tuple[MusehubRepo, MusehubRepo]
    ) -> None:
        """Pairs from repo A are never visible when querying repo B."""
        r1, r2 = two_repos
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, r1.repo_id, c1)
        await _seed_commit(db_session, r1.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, r1.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, r1.repo_id, c2)
        # Repo 2 has no data
        pairs_r2 = await _fetch_pairs(db_session, r2.repo_id)
        assert pairs_r2 == []

    @pytest.mark.asyncio
    async def test_ET_50_repo_isolation_no_cross_contamination(
        self, db_session: AsyncSession, two_repos: tuple[MusehubRepo, MusehubRepo]
    ) -> None:
        """Two repos each get their own independent pair sets."""
        r1, r2 = two_repos
        for repo in [r1, r2]:
            # Seed identical symbol addresses in both repos, using fresh
            # commit ids for each, and run the provider once per repo.
            c1, c2 = _cid(), _cid()
            await _seed_commit(db_session, repo.repo_id, c1)
            await _seed_commit(db_session, repo.repo_id, c2, [c1])
            for cid in [c1, c2]:
                await _seed_history(db_session, repo.repo_id, cid, [
                    "src/a.py::fn_a", "src/b.py::fn_b",
                ])
            await db_session.commit()
            await _run_provider(db_session, repo.repo_id, c2)
        pairs_r1 = await _fetch_pairs(db_session, r1.repo_id)
        pairs_r2 = await _fetch_pairs(db_session, r2.repo_id)
        assert len(pairs_r1) == 1
        assert len(pairs_r2) == 1
        assert pairs_r1[0].repo_id == r1.repo_id
        assert pairs_r2[0].repo_id == r2.repo_id

    @pytest.mark.asyncio
    async def test_ET_51_delete_old_provider_run_on_rerun(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Rerun for a different ref purges all previous rows for the repo."""
        c1, c2, c3 = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        for cid in [c1, c2, c3]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        await _run_provider(db_session, repo.repo_id, c3)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Guard against a vacuous pass: the symbols co-change in every commit,
        # so the rerun must leave at least one row to inspect.
        assert pairs, "expected pair rows after the rerun"
        # All stored rows must point to the latest ref
        for p in pairs:
            assert p.ref == c3

    @pytest.mark.asyncio
    async def test_ET_52_unicode_in_address_handled(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Unicode characters in addresses do not crash the provider."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/música.py::canción",
                "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Smoke check only: the run and the fetch completed without error.
        assert isinstance(pairs, list)

    @pytest.mark.asyncio
    async def test_ET_53_long_address_does_not_exceed_column_width(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Long addresses that stay under the column width are stored as one pair.

        Both addresses are 106 characters long: a 100-character file part
        followed by ``::fn_a`` / ``::fn_b``.  The test deliberately stays
        below the limit; it does not exercise over-length input.
        """
        # NOTE(review): the original author's comment states the model column
        # is VARCHAR(512) and that the route layer truncates addresses; neither
        # is visible from this test — confirm against the model definition.
        addr_a = "src/" + "a" * 96 + "::fn_a"
        addr_b = "src/" + "b" * 96 + "::fn_b"
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [addr_a, addr_b])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1

    @pytest.mark.asyncio
    async def test_ET_54_newline_in_address_stored_verbatim(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Newline characters in addresses don't trigger injections or errors."""
        addr_a = "src/a.py::fn\n_a"
        addr_b = "src/b.py::fn_b"
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [addr_a, addr_b])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Smoke check only: the run and the fetch completed without error.
        assert isinstance(pairs, list)
File History 1 commit
sha256:ef10830ce231e0a20efcb0e2586cb879471247e916616e6fdd0d51df459e2595 fix: typing audit — 0 violations, 0 untyped defs across all… Sonnet 4.6 minor 20 days ago