tests/test_entangle_provider.py · gabriel/musehub

test_entangle_provider.py python

1,199 lines 52.0 KB

sha256:ef10830ce231e0a20efcb0e2586cb879471247e916616e6fdd0d51df459e2595 fix: typing audit — 0 violations, 0 untyped defs across all… Sonnet 4.6 minor ⚠ breaking 20 days ago

1	"""TDD spec for EntangleProvider — issue #13, Phase 5.
2
3	Verifies that EntangleProvider reproduces the same co-change analysis as
4	``muse code entangle``: Jaccard-min rate, import filter, mass-commit exclusion,
5	canonical pair ordering, and repo isolation.
6
7	Eight test tiers (54 cases)
8	---------------------------
9	Unit ET_01 – ET_08 rate formula, import filter, pair canonicalisation
10	Integration ET_09 – ET_18 provider upserts, re-runs, row counts
11	E2E ET_19 – ET_25 full seeded scenarios
12	Stress ET_26 – ET_30 500-symbol batch, mass-commit exclusion
13	State ET_31 – ET_36 idempotency, incremental updates, stale-row purge
14	Integrity ET_37 – ET_41 corrupt addresses, NULL exclusion, file-same filter
15	Performance ET_42 – ET_46 timing bounds on realistic datasets
16	Security ET_47 – ET_54 injection strings, repo isolation, address length cap
17	"""
18	from __future__ import annotations
19
20	import secrets
21	import time
22	from collections import defaultdict
23	from itertools import combinations
24
25	import pytest
26	import pytest_asyncio
27	import sqlalchemy as sa
28	from sqlalchemy.dialects.postgresql import insert as pg_insert
29	from sqlalchemy.ext.asyncio import AsyncSession
30
31	from muse.core.types import fake_id, long_id
32	from musehub.db.musehub_intel_models import MusehubIntelEntangle, MusehubSymbolHistoryEntry
33	from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubRepo
34	from musehub.services.musehub_intel_providers import EntangleProvider
35	from musehub.types.json_types import JSONObject
36	from tests.factories import create_repo
37
38
39	# ─────────────────────────────────────────────────────────────────────────────
40	# Helpers
41	# ─────────────────────────────────────────────────────────────────────────────
42
def _uid() -> str:
    """Return ``fake_id`` applied to a freshly generated 16-byte random hex token."""
    token = secrets.token_hex(16)
    return fake_id(token)
45
46
def _cid() -> str:
    """Return ``long_id`` applied to a freshly generated 32-byte random hex token."""
    token = secrets.token_hex(32)
    return long_id(token)
49
50
# Owner shared by every repo the fixtures create, and the slug of the
# single-repo ``repo`` fixture.
_OWNER = "testuser"
_SLUG = "entangleprovider"
53
54
async def _seed_commit(
    session: AsyncSession,
    repo_id: str,
    commit_id: str,
    parent_ids: list[str] | None = None,
) -> None:
    """Insert one commit row and the ref row tying it to *repo_id*.

    Both inserts use ``ON CONFLICT DO NOTHING``, so seeding the same commit
    twice is harmless. Nothing is committed here; the caller owns the
    transaction.

    Args:
        session: Async session the inserts are executed on.
        repo_id: Repository the commit ref row points at.
        commit_id: Identifier of the commit to create.
        parent_ids: Parent commit ids; ``None`` or empty means a root commit.
    """
    from datetime import datetime, timezone

    commit_row = {
        "commit_id": commit_id,
        "message": "test commit",
        "author": "test",
        "branch": "dev",
        "parent_ids": parent_ids if parent_ids else [],
        "snapshot_id": None,
        "timestamp": datetime.now(tz=timezone.utc),
    }
    insert_commit = pg_insert(MusehubCommit).values(**commit_row)
    await session.execute(insert_commit.on_conflict_do_nothing())

    # Link the commit to the repo after the commit row itself.
    insert_ref = pg_insert(MusehubCommitRef).values(
        repo_id=repo_id, commit_id=commit_id
    )
    await session.execute(insert_ref.on_conflict_do_nothing())
82
83
async def _seed_history(
    session: AsyncSession,
    repo_id: str,
    commit_id: str,
    addresses: list[str],
) -> None:
    """Insert one ``update`` history entry per address for the given commit.

    All rows share a single ``committed_at`` timestamp taken once up front.
    Each insert uses ``ON CONFLICT DO NOTHING``; nothing is committed here.

    Args:
        session: Async session the inserts are executed on.
        repo_id: Repository the history entries belong to.
        commit_id: Commit the entries are attributed to.
        addresses: Symbol addresses to record, one row each, in order.
    """
    from datetime import datetime, timezone

    committed_at = datetime.now(tz=timezone.utc)
    for address in addresses:
        entry = {
            "repo_id": repo_id,
            "address": address,
            "commit_id": commit_id,
            "committed_at": committed_at,
            "op": "update",
        }
        insert_entry = pg_insert(MusehubSymbolHistoryEntry).values(**entry)
        await session.execute(insert_entry.on_conflict_do_nothing())
105
106
async def _run_provider(
    session: AsyncSession, repo_id: str, ref: str
) -> list[tuple[str, JSONObject]]:
    """Run a fresh ``EntangleProvider`` for *repo_id* at *ref* and return its output.

    The provider is invoked with an empty dict as its fourth argument.
    """
    provider = EntangleProvider()
    outcome = await provider.compute(session, repo_id, ref, {})
    return outcome
111
112
async def _fetch_pairs(
    session: AsyncSession, repo_id: str
) -> list[MusehubIntelEntangle]:
    """Load every stored entangle row for *repo_id*, strongest coupling first.

    Rows are ordered by ``co_change_rate`` descending, then by ``co_changes``
    descending.
    """
    strongest_first = (
        sa.desc(MusehubIntelEntangle.co_change_rate),
        sa.desc(MusehubIntelEntangle.co_changes),
    )
    query = (
        sa.select(MusehubIntelEntangle)
        .where(MusehubIntelEntangle.repo_id == repo_id)
        .order_by(*strongest_first)
    )
    rows = await session.execute(query)
    return list(rows.scalars().all())
125
126
127	# ─────────────────────────────────────────────────────────────────────────────
128	# Fixtures
129	# ─────────────────────────────────────────────────────────────────────────────
130
@pytest_asyncio.fixture
async def repo(db_session: AsyncSession) -> MusehubRepo:
    """Create the single repo (``_OWNER`` / ``_SLUG``) most tests run against."""
    created = await create_repo(db_session, owner=_OWNER, slug=_SLUG)
    return created
134
135
@pytest_asyncio.fixture
async def two_repos(db_session: AsyncSession) -> tuple[MusehubRepo, MusehubRepo]:
    """Create two repos under the same owner, returned in creation order."""
    first = await create_repo(db_session, owner=_OWNER, slug="et-repo-1")
    second = await create_repo(db_session, owner=_OWNER, slug="et-repo-2")
    return (first, second)
141
142
143	# ─────────────────────────────────────────────────────────────────────────────
144	# Tier 1 — Unit: rate formula, import filter, pair canonicalisation
145	# ─────────────────────────────────────────────────────────────────────────────
146
class TestEntangleUnit:
    """Pure-function unit tests — no database required.

    The rate checks use ``co_changes / |commits_a ∪ commits_b|``, the same
    union-based rate the integration tier pins (ET_18: 2 co-changes over a
    union of 4 commits gives 0.5).
    """

    def test_ET_01_jaccard_min_rate_perfect(self) -> None:
        """100% rate: A and B co-change in every commit either appears in."""
        symbol_commits = {
            "src/billing.py::charge": {"c1", "c2", "c3"},
            "src/ledger.py::record": {"c1", "c2", "c3"},
        }
        a, b = "src/billing.py::charge", "src/ledger.py::record"
        co = len(symbol_commits[a] & symbol_commits[b])
        rate = co / len(symbol_commits[a] | symbol_commits[b])
        assert co == 3
        assert rate == 1.0

    def test_ET_02_jaccard_min_rate_partial(self) -> None:
        """Partial rate: B appears only in a subset of A's commits."""
        symbol_commits = {
            "src/a.py::fn1": {"c1", "c2", "c3", "c4", "c5"},
            "src/b.py::fn2": {"c1", "c2"},
        }
        a, b = "src/a.py::fn1", "src/b.py::fn2"
        co = len(symbol_commits[a] & symbol_commits[b])
        rate = co / len(symbol_commits[a] | symbol_commits[b])
        assert co == 2
        # 2 shared commits out of 5 distinct commits overall.
        assert rate == 0.4

    def test_ET_03_jaccard_min_rate_low(self) -> None:
        """Low coupling: only 1 of B's 10 commits overlaps with A."""
        symbol_commits = {
            "src/a.py::fn1": {"c1"},
            "src/b.py::fn2": {"c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "c10"},
        }
        a, b = "src/a.py::fn1", "src/b.py::fn2"
        co = len(symbol_commits[a] & symbol_commits[b])
        rate = co / len(symbol_commits[a] | symbol_commits[b])
        assert co == 1
        # 1 shared commit out of 10 distinct commits overall.
        assert rate == 0.1

    def test_ET_04_import_pseudo_symbol_excluded(self) -> None:
        """Addresses containing ::import:: must be filtered."""
        addr = "src/billing.py::import::os"
        assert "::import::" in addr

    def test_ET_05_bare_path_excluded(self) -> None:
        """Addresses without '::' are bare file paths — not symbols."""
        addr = "cloudflare"
        assert "::" not in addr

    def test_ET_06_pair_key_canonical_ordering(self) -> None:
        """Pair key is always (a, b) where a < b lexicographically."""
        syms = ["src/z.py::zfn", "src/a.py::afn"]
        canonical = tuple(sorted(syms))
        assert canonical == ("src/a.py::afn", "src/z.py::zfn")

    def test_ET_07_same_file_pairs_excluded(self) -> None:
        """Pairs where file_a == file_b must be excluded."""
        a = "src/billing.py::charge"
        b = "src/billing.py::refund"
        assert a.split("::")[0] == b.split("::")[0]

    def test_ET_08_min_co_changes_threshold(self) -> None:
        """Pairs with co_changes < 2 are noise — must be excluded."""
        provider = EntangleProvider()
        assert provider._MIN_CO_CHANGES == 2
209
210
211	# ─────────────────────────────────────────────────────────────────────────────
212	# Tier 2 — Integration: provider upserts, reruns, row counts
213	# ─────────────────────────────────────────────────────────────────────────────
214
class TestEntangleIntegration:
    """Database-backed checks of what the provider returns and stores."""

    @pytest.mark.asyncio
    async def test_ET_09_empty_repo_returns_empty(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Provider on a repo with no commits returns empty results."""
        ref = _cid()
        result = await _run_provider(db_session, repo.repo_id, ref)
        assert result == []
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert pairs == []

    @pytest.mark.asyncio
    async def test_ET_10_no_history_entries_returns_empty(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Commits exist but no history entries → no pairs."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        await db_session.commit()
        # Run against the seeded repo at its head commit; the helper's
        # signature is (session, repo_id, ref).
        result = await _run_provider(db_session, repo.repo_id, c2)
        assert result == []
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert pairs == []

    @pytest.mark.asyncio
    async def test_ET_11_two_symbols_in_one_commit_no_pair(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Single co-change commit yields co_changes=1 — below MIN_CO_CHANGES=2."""
        c1 = _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_history(db_session, repo.repo_id, c1, [
            "src/a.py::fn_a", "src/b.py::fn_b",
        ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c1)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert pairs == []

    @pytest.mark.asyncio
    async def test_ET_12_two_co_changes_produces_one_pair(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Exactly 2 co-change commits → 1 pair at rate 1.0."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/billing.py::charge",
                "src/ledger.py::record",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1
        p = pairs[0]
        assert p.co_changes == 2
        assert p.co_change_rate == 1.0

    @pytest.mark.asyncio
    async def test_ET_13_import_symbols_excluded(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Import pseudo-symbols are not stored as entangle pairs."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::import::os",
                "src/b.py::import::sys",
                "src/a.py::real_fn",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        for p in pairs:
            assert "::import::" not in p.symbol_a
            assert "::import::" not in p.symbol_b

    @pytest.mark.asyncio
    async def test_ET_14_bare_path_addresses_excluded(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Bare path entries (no '::') are not treated as symbols."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "cloudflare",
                "src/a.py::real_fn",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        for p in pairs:
            assert "::" in p.symbol_a
            assert "::" in p.symbol_b

    @pytest.mark.asyncio
    async def test_ET_15_same_file_pair_excluded(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Two symbols from the same file must not produce a pair."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/billing.py::charge",
                "src/billing.py::refund",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert pairs == []

    @pytest.mark.asyncio
    async def test_ET_16_pair_stored_canonical_a_lt_b(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Stored pair always has symbol_a < symbol_b lexicographically."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        # Seeded in reverse lexicographic order on purpose.
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/z.py::zfn",
                "src/a.py::afn",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1
        # Strict ordering: the two symbols are distinct, so equality would
        # itself be a bug.
        assert pairs[0].symbol_a < pairs[0].symbol_b
        assert (pairs[0].symbol_a, pairs[0].symbol_b) == (
            "src/a.py::afn",
            "src/z.py::zfn",
        )

    @pytest.mark.asyncio
    async def test_ET_17_file_a_b_populated(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """file_a and file_b columns derive from the symbol address."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/billing.py::charge",
                "src/ledger.py::record",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1
        p = pairs[0]
        assert p.file_a is not None and "/" in p.file_a
        assert p.file_b is not None and "/" in p.file_b
        assert p.file_a != p.file_b

    @pytest.mark.asyncio
    async def test_ET_18_commits_both_active_is_min(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """commits_both_active equals |commits_a ∪ commits_b| (Jaccard union).

        The method name still says "min", but the assertions below pin the
        union-based count and rate.
        """
        # B appears in 2 commits; A in 4 commits; co_changes = 2
        # union = 4 + 2 - 2 = 4; rate = 2/4 = 0.5
        commits = [_cid() for _ in range(4)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        # A in all 4
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, ["src/a.py::fn_a"])
        # B only in first 2
        for cid in commits[:2]:
            await _seed_history(db_session, repo.repo_id, cid, ["src/b.py::fn_b"])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1
        assert pairs[0].commits_both_active == 4  # union: 4 + 2 - 2
        assert pairs[0].co_changes == 2
        assert pairs[0].co_change_rate == 0.5
401
402
403	# ─────────────────────────────────────────────────────────────────────────────
404	# Tier 3 — E2E: full seeded scenarios
405	# ─────────────────────────────────────────────────────────────────────────────
406
class TestEntangleE2E:
    """End-to-end scenarios: seed a small history, run the provider, inspect rows."""

    @pytest.mark.asyncio
    async def test_ET_19_three_symbol_pair_ranking(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Three symbols; A/B co-change more often than A/C, so A/B ranks first."""
        chain = [_cid() for _ in range(5)]
        parent = None
        for commit_id in chain:
            parents = [parent] if parent else []
            await _seed_commit(db_session, repo.repo_id, commit_id, parents)
            parent = commit_id
        # A and B change together in every one of the 5 commits.
        ab_addresses = ["src/a.py::fn_a", "src/b.py::fn_b"]
        for commit_id in chain:
            await _seed_history(db_session, repo.repo_id, commit_id, ab_addresses)
        # C joins them only in the first 2 commits.
        for commit_id in chain[:2]:
            await _seed_history(db_session, repo.repo_id, commit_id, ["src/c.py::fn_c"])
        await db_session.commit()
        head = chain[-1]
        await _run_provider(db_session, repo.repo_id, head)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert len(stored) == 3
        # Rows come back strongest first; the top row must be at rate 1.0.
        top = stored[0]
        assert top.co_change_rate == 1.0

    @pytest.mark.asyncio
    async def test_ET_20_a_in_test_flag_set_for_test_files(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Exactly one of a_in_test / b_in_test is set for a test/non-test pair."""
        first, second = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, first)
        await _seed_commit(db_session, repo.repo_id, second, [first])
        addresses = [
            "tests/test_billing.py::test_charge",
            "src/ledger.py::record",
        ]
        for commit_id in (first, second):
            await _seed_history(db_session, repo.repo_id, commit_id, addresses)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, second)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert len(stored) == 1
        pair = stored[0]
        # One side lives under tests/, the other does not.
        assert pair.a_in_test != pair.b_in_test

    @pytest.mark.asyncio
    async def test_ET_21_result_metadata_keys(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Provider returns one (key, payload) tuple carrying the expected keys."""
        first, second = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, first)
        await _seed_commit(db_session, repo.repo_id, second, [first])
        addresses = ["src/a.py::fn_a", "src/b.py::fn_b"]
        for commit_id in (first, second):
            await _seed_history(db_session, repo.repo_id, commit_id, addresses)
        await db_session.commit()
        outcome = await _run_provider(db_session, repo.repo_id, second)
        assert len(outcome) == 1
        key, payload = outcome[0]
        assert key == "intel.code.entangle"
        assert "count" in payload
        assert "commits_analysed" in payload
        assert "truncated" in payload

    @pytest.mark.asyncio
    async def test_ET_22_ref_stored_on_pair_row(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """The ref passed to the provider is recorded on the stored pair row."""
        first, second = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, first)
        await _seed_commit(db_session, repo.repo_id, second, [first])
        addresses = ["src/a.py::fn_a", "src/b.py::fn_b"]
        for commit_id in (first, second):
            await _seed_history(db_session, repo.repo_id, commit_id, addresses)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, second)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert len(stored) == 1
        assert stored[0].ref == second

    @pytest.mark.asyncio
    async def test_ET_23_multiple_disconnected_pairs(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Symbols from four different files yield at least two stored pairs."""
        first, second, third = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, first)
        await _seed_commit(db_session, repo.repo_id, second, [first])
        await _seed_commit(db_session, repo.repo_id, third, [second])
        addresses = [
            "src/alpha.py::a1", "src/beta.py::b1",  # pair 1
            "src/gamma.py::c1", "src/delta.py::d1",  # pair 2
        ]
        for commit_id in (first, second, third):
            await _seed_history(db_session, repo.repo_id, commit_id, addresses)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, third)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        # At least the two intended cross-file pairs must be present.
        assert len(stored) >= 2

    @pytest.mark.asyncio
    async def test_ET_24_structurally_linked_defaults_false(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """structurally_linked is False on every stored pair."""
        first, second = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, first)
        await _seed_commit(db_session, repo.repo_id, second, [first])
        addresses = ["src/a.py::fn_a", "src/b.py::fn_b"]
        for commit_id in (first, second):
            await _seed_history(db_session, repo.repo_id, commit_id, addresses)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, second)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert all(pair.structurally_linked is False for pair in stored)

    @pytest.mark.asyncio
    async def test_ET_25_same_file_false_on_stored_pair(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """same_file is False on every stored pair."""
        first, second = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, first)
        await _seed_commit(db_session, repo.repo_id, second, [first])
        addresses = ["src/a.py::fn_a", "src/b.py::fn_b"]
        for commit_id in (first, second):
            await _seed_history(db_session, repo.repo_id, commit_id, addresses)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, second)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert all(pair.same_file is False for pair in stored)
548
549
550	# ─────────────────────────────────────────────────────────────────────────────
551	# Tier 4 — Stress: large datasets
552	# ─────────────────────────────────────────────────────────────────────────────
553
class TestEntangleStress:
    """Larger seeded datasets: pair cap, mass commits, walk cap."""

    @pytest.mark.asyncio
    async def test_ET_26_max_pairs_cap_respected(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Provider stores at most MAX_PAIRS pairs even when more exist."""
        provider = EntangleProvider()
        # Build enough distinct cross-file pairs by spreading symbols
        # across 35 files × 2 symbols = 70 symbols → 70*69/2 = 2415 pairs before filter
        commits = [_cid() for _ in range(3)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        addrs = [f"src/file_{i}.py::fn_{j}" for i in range(35) for j in range(2)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) <= provider._MAX_PAIRS

    @pytest.mark.asyncio
    async def test_ET_27_mass_commit_excluded(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Commits touching > MAX_SYMBOLS_PER_COMMIT symbols are skipped."""
        provider = EntangleProvider()
        # Seed two legit commits and one mass commit
        c_legit1, c_legit2, c_mass = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c_legit1)
        await _seed_commit(db_session, repo.repo_id, c_legit2, [c_legit1])
        await _seed_commit(db_session, repo.repo_id, c_mass, [c_legit2])
        # Legit commits: A and B co-change
        for cid in [c_legit1, c_legit2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        # Mass commit: 100 symbols over the per-commit limit
        big_addrs = [f"src/gen_{i}.py::fn" for i in range(provider._MAX_SYMBOLS_PER_COMMIT + 100)]
        await _seed_history(db_session, repo.repo_id, c_mass, big_addrs)
        await db_session.commit()
        # Only the stored rows are inspected, so the return value is not kept.
        await _run_provider(db_session, repo.repo_id, c_mass)
        # Provider should still return the AB pair from legit commits
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert any(
            ("src/a.py::fn_a" in (p.symbol_a, p.symbol_b))
            for p in pairs
        )

    @pytest.mark.asyncio
    async def test_ET_28_500_symbols_completes(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """500 symbols across 10 commits completes without error."""
        commits = [_cid() for _ in range(10)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        # 250 files × 2 symbols = 500 symbols (all under mass-commit limit)
        addrs = [f"src/f{i}.py::fn_{j}" for i in range(250) for j in range(2)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        result = await _run_provider(db_session, repo.repo_id, commits[-1])
        assert result  # no exception

    @pytest.mark.asyncio
    async def test_ET_29_result_count_matches_stored_rows(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """metadata 'count' matches the actual number of rows stored."""
        c1, c2, c3 = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        for cid in [c1, c2, c3]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a",
                "src/b.py::fn_b",
                "src/c.py::fn_c",
            ])
        await db_session.commit()
        result = await _run_provider(db_session, repo.repo_id, c3)
        _, payload = result[0]
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert payload["count"] == len(pairs)

    @pytest.mark.asyncio
    async def test_ET_30_bfs_walk_cap_limits_commits_analysed(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """commits_analysed never exceeds MAX_WALK."""
        provider = EntangleProvider()
        cap = provider._MAX_WALK
        commits: list[str] = []
        prev = None
        for _ in range(min(cap + 5, 50)):  # keep it fast; just verify cap exists
            cid = _cid()
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            commits.append(cid)
            prev = cid
        # Put the co-change on the two newest commits. That gives the pair
        # two co-changes (enough to meet MIN_CO_CHANGES) next to the ref the
        # provider starts from, so a payload is produced and the cap
        # assertion below always runs instead of being skipped on an empty
        # result.
        for cid in commits[-2:]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        result = await _run_provider(db_session, repo.repo_id, commits[-1])
        assert result
        _, payload = result[0]
        assert payload["commits_analysed"] <= cap
666
667
668	# ─────────────────────────────────────────────────────────────────────────────
669	# Tier 5 — State: idempotency, incremental updates, stale-row purge
670	# ─────────────────────────────────────────────────────────────────────────────
671
class TestEntangleState:
    """Re-run behaviour: idempotency, incremental growth, rate updates."""

    @pytest.mark.asyncio
    async def test_ET_31_idempotent_rerun_same_rows(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Running the provider twice produces the same set of rows."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        first_run = await _fetch_pairs(db_session, repo.repo_id)
        await _run_provider(db_session, repo.repo_id, c2)
        second_run = await _fetch_pairs(db_session, repo.repo_id)
        assert len(first_run) == len(second_run)
        assert {(p.symbol_a, p.symbol_b) for p in first_run} == {
            (p.symbol_a, p.symbol_b) for p in second_run
        }

    @pytest.mark.asyncio
    async def test_ET_32_stale_rows_purged_on_rerun(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """A re-run after an unrelated commit leaves exactly the one valid pair.

        The new commit touches a different symbol only, so the A/B pair from
        the first two commits must still be the single stored row.
        """
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        first_count_result = await db_session.execute(
            sa.select(sa.func.count()).select_from(MusehubIntelEntangle)
            .where(MusehubIntelEntangle.repo_id == repo.repo_id)
        )
        assert first_count_result.scalar_one() == 1

        # Add a new commit that touches a completely different symbol
        c3 = _cid()
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        await _seed_history(db_session, repo.repo_id, c3, [
            "src/x.py::fn_x",  # completely different
        ])
        # Re-run; AB pair should still exist (still valid from c1, c2)
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c3)
        second_run = await _fetch_pairs(db_session, repo.repo_id)
        assert len(second_run) == 1  # AB still valid

    @pytest.mark.asyncio
    async def test_ET_33_incremental_new_pair_appears(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """After adding commits, a new pair materialises on re-run."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        before = await _fetch_pairs(db_session, repo.repo_id)

        # Two new commits introducing a CD pair
        c3, c4 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        await _seed_commit(db_session, repo.repo_id, c4, [c3])
        for cid in [c3, c4]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/c.py::fn_c", "src/d.py::fn_d",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c4)
        after = await _fetch_pairs(db_session, repo.repo_id)
        assert len(after) > len(before)

    @pytest.mark.asyncio
    async def test_ET_34_no_duplicate_pairs(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """No (symbol_a, symbol_b) duplicate rows for the same repo."""
        c1, c2, c3 = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        for cid in [c1, c2, c3]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        for _ in range(3):
            await _run_provider(db_session, repo.repo_id, c3)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        keys = [(p.symbol_a, p.symbol_b) for p in pairs]
        assert len(keys) == len(set(keys))

    @pytest.mark.asyncio
    async def test_ET_35_rate_updates_on_new_commits(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Rate does not drop when another co-change commit is added."""
        # Initial: A in 3 commits, B in 2 commits, co=2 → rate=2/3
        # (union-based rate, as pinned by ET_18)
        commits = [_cid() for _ in range(3)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        # A appears in all 3
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, ["src/a.py::fn_a"])
        # B co-changes only in first 2
        for cid in commits[:2]:
            await _seed_history(db_session, repo.repo_id, cid, ["src/b.py::fn_b"])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        pairs_before = await _fetch_pairs(db_session, repo.repo_id)
        # Two co-changes already meet the threshold, so the pair must exist;
        # without this check a missing pair would compare 0.0 against 0.0.
        assert pairs_before
        rate_before = pairs_before[0].co_change_rate

        # Now add a commit where both co-change again
        c_new = _cid()
        await _seed_commit(db_session, repo.repo_id, c_new, [commits[-1]])
        await _seed_history(db_session, repo.repo_id, c_new, [
            "src/a.py::fn_a", "src/b.py::fn_b",
        ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c_new)
        pairs_after = await _fetch_pairs(db_session, repo.repo_id)
        assert pairs_after
        rate_after = pairs_after[0].co_change_rate
        assert rate_after >= rate_before

    @pytest.mark.asyncio
    async def test_ET_36_truncated_flag_true_when_over_cap(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """truncated=True when more pairs were found than MAX_PAIRS."""
        commits = [_cid() for _ in range(3)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        # 35 files × 2 syms → ~2415 cross-file pairs, exceeds MAX_PAIRS=500
        addrs = [f"src/file_{i}.py::fn_{j}" for i in range(35) for j in range(2)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        result = await _run_provider(db_session, repo.repo_id, commits[-1])
        _, payload = result[0]
        assert payload["truncated"] is True
830
831
832	# ─────────────────────────────────────────────────────────────────────────────
833	# Tier 6 — Integrity: edge cases and data quality
834	# ─────────────────────────────────────────────────────────────────────────────
835
class TestEntangleIntegrity:
    """Tier 6 — edge cases and data-quality checks (ET_37 – ET_41)."""

    @pytest.mark.asyncio
    async def test_ET_37_address_with_only_import_produces_no_pair(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Commits that touch only import pseudo-symbols yield no pair rows."""
        parent, head = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, parent)
        await _seed_commit(db_session, repo.repo_id, head, [parent])
        for commit_id in (parent, head):
            await _seed_history(db_session, repo.repo_id, commit_id, [
                "src/a.py::import::os",
                "src/b.py::import::sys",
                "src/c.py::import::typing",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, head)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert stored == []

    @pytest.mark.asyncio
    async def test_ET_38_mixed_valid_and_import_symbols(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """No stored pair may contain an import pseudo-symbol on either side."""
        parent, head = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, parent)
        await _seed_commit(db_session, repo.repo_id, head, [parent])
        for commit_id in (parent, head):
            await _seed_history(db_session, repo.repo_id, commit_id, [
                "src/a.py::real_fn",
                "src/b.py::import::os",  # filtered
                "src/c.py::other_fn",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, head)
        stored = await _fetch_pairs(db_session, repo.repo_id)
        # Check both ends of every stored pair.
        assert all("::import::" not in row.symbol_a for row in stored)
        assert all("::import::" not in row.symbol_b for row in stored)

    @pytest.mark.asyncio
    async def test_ET_39_unknown_ref_in_bfs_returns_empty(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Running the provider on a ref that was never seeded returns ``[]``."""
        never_seeded = _cid()
        outcome = await _run_provider(db_session, repo.repo_id, never_seeded)
        assert outcome == []

    @pytest.mark.asyncio
    async def test_ET_40_co_changes_count_exact(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """co_changes equals the number of commits in which both symbols appear."""
        together_count, a_only_count = 4, 2
        together = [_cid() for _ in range(together_count)]
        a_only = [_cid() for _ in range(a_only_count)]
        chain = together + a_only
        # Linear history: pair each commit with its predecessor (None for the root).
        for parent, commit_id in zip([None, *chain], chain):
            await _seed_commit(
                db_session, repo.repo_id, commit_id, [parent] if parent else []
            )
        for commit_id in together:
            await _seed_history(db_session, repo.repo_id, commit_id, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        for commit_id in a_only:
            await _seed_history(db_session, repo.repo_id, commit_id, ["src/a.py::fn_a"])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, chain[-1])
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert len(stored) == 1
        pair = stored[0]
        # fn_a changes in all 6 commits, fn_b in 4, both together in 4:
        # union = 6 + 4 - 4 = 6.
        expected_union = together_count + a_only_count
        assert pair.co_changes == together_count
        assert pair.commits_both_active == expected_union
        assert pair.co_change_rate == round(together_count / expected_union, 6)

    @pytest.mark.asyncio
    async def test_ET_41_rate_capped_at_one(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Every stored co_change_rate lies within the closed range [0.0, 1.0]."""
        chain = [_cid() for _ in range(5)]
        for parent, commit_id in zip([None, *chain], chain):
            await _seed_commit(
                db_session, repo.repo_id, commit_id, [parent] if parent else []
            )
        for commit_id in chain:
            await _seed_history(db_session, repo.repo_id, commit_id, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, chain[-1])
        stored = await _fetch_pairs(db_session, repo.repo_id)
        assert all(0.0 <= row.co_change_rate <= 1.0 for row in stored)
936
937
938	# ─────────────────────────────────────────────────────────────────────────────
939	# Tier 7 — Performance: timing bounds
940	# ─────────────────────────────────────────────────────────────────────────────
941
class TestEntanglePerformance:
    """Tier 7 — wall-clock bounds for provider runs and pair lookups (ET_42 – ET_46).

    Durations are measured with ``time.perf_counter()``, the clock the
    standard library provides for timing short intervals at the highest
    available resolution. ``time.monotonic()`` may tick far more coarsely on
    some platforms, which makes bounds as tight as 10 ms or 50 ms unreliable.
    """

    @pytest.mark.asyncio
    async def test_ET_42_ten_commits_ten_symbols_under_500ms(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """10 commits × 10 symbols completes in under 500 ms."""
        commits = [_cid() for _ in range(10)]
        prev = None
        for cid in commits:
            # The first commit is a root; every later one points at its predecessor.
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        addrs = [f"src/file_{i}.py::fn" for i in range(10)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        # Seeding is excluded from the measurement; only the provider run is timed.
        t0 = time.perf_counter()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        elapsed = time.perf_counter() - t0
        assert elapsed < 0.5, f"took {elapsed:.3f}s"

    @pytest.mark.asyncio
    async def test_ET_43_100_commits_20_symbols_under_2s(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """100 commits × 20 symbols completes in under 2 s."""
        commits = [_cid() for _ in range(100)]
        prev = None
        for cid in commits:
            await _seed_commit(db_session, repo.repo_id, cid, [prev] if prev else [])
            prev = cid
        addrs = [f"src/f{i}.py::fn" for i in range(20)]
        for cid in commits:
            await _seed_history(db_session, repo.repo_id, cid, addrs)
        await db_session.commit()
        t0 = time.perf_counter()
        await _run_provider(db_session, repo.repo_id, commits[-1])
        elapsed = time.perf_counter() - t0
        assert elapsed < 2.0, f"took {elapsed:.3f}s"

    @pytest.mark.asyncio
    async def test_ET_44_empty_repo_under_50ms(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Empty repo fast-path exits under 50 ms."""
        # Nothing is seeded, so the freshly generated ref is unknown to the repo.
        t0 = time.perf_counter()
        await _run_provider(db_session, repo.repo_id, _cid())
        elapsed = time.perf_counter() - t0
        assert elapsed < 0.05, f"took {elapsed:.3f}s"

    @pytest.mark.asyncio
    async def test_ET_45_rerun_same_speed_as_first(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Second run is not significantly slower than first run."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        t1 = time.perf_counter()
        await _run_provider(db_session, repo.repo_id, c2)
        d1 = time.perf_counter() - t1
        t2 = time.perf_counter()
        await _run_provider(db_session, repo.repo_id, c2)
        d2 = time.perf_counter() - t2
        # The second run may be at most 5× slower than the first; the 0.5 s
        # floor keeps a very fast first run from making the bound too strict.
        assert d2 < max(d1 * 5, 0.5)

    @pytest.mark.asyncio
    async def test_ET_46_point_lookup_fast(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Fetching pairs for a specific repo is sub-10 ms after provider run."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        # Only the read-back is timed, not the provider run that produced the rows.
        t0 = time.perf_counter()
        await _fetch_pairs(db_session, repo.repo_id)
        elapsed = time.perf_counter() - t0
        assert elapsed < 0.01, f"took {elapsed:.3f}s"
1032
1033
1034	# ─────────────────────────────────────────────────────────────────────────────
1035	# Tier 8 — Security: injection, isolation, address length
1036	# ─────────────────────────────────────────────────────────────────────────────
1037
class TestEntangleSecurity:
    """Tier 8 — hostile address strings, repo isolation, address length (ET_47 – ET_54).

    Several tests here are smoke checks: they assert only that the provider
    run and the subsequent fetch complete and that the fetch returns a list.
    """

    @pytest.mark.asyncio
    async def test_ET_47_sql_injection_in_address_stored_verbatim(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """SQL injection strings in symbol addresses are stored as-is (no execution)."""
        inject = "src/a.py::fn'; DROP TABLE musehub_intel_entangle; --"
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                inject,
                "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        # If the payload had been executed, the table would be gone and this
        # fetch would fail instead of returning.
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Smoke check only: the stored content is not inspected.
        assert isinstance(pairs, list)

    @pytest.mark.asyncio
    async def test_ET_48_xss_payload_in_address_stored_safely(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """XSS payloads in addresses are stored without execution."""
        xss = "src/<script>alert(1)</script>.py::fn"
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                xss,
                "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Smoke check only: the stored content is not inspected.
        assert isinstance(pairs, list)

    @pytest.mark.asyncio
    async def test_ET_49_repo_isolation_strict(
        self, db_session: AsyncSession, two_repos: tuple[MusehubRepo, MusehubRepo]
    ) -> None:
        """Pairs from repo A are never visible when querying repo B."""
        r1, r2 = two_repos
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, r1.repo_id, c1)
        await _seed_commit(db_session, r1.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, r1.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, r1.repo_id, c2)
        # Repo 2 was never seeded and never analysed, so it must have no rows.
        pairs_r2 = await _fetch_pairs(db_session, r2.repo_id)
        assert pairs_r2 == []

    @pytest.mark.asyncio
    async def test_ET_50_repo_isolation_no_cross_contamination(
        self, db_session: AsyncSession, two_repos: tuple[MusehubRepo, MusehubRepo]
    ) -> None:
        """Two repos each get their own independent pair sets."""
        r1, r2 = two_repos
        # Both repos receive the same two-symbol history under their own
        # freshly generated commit ids, and each is analysed separately.
        for repo in [r1, r2]:
            c1, c2 = _cid(), _cid()
            await _seed_commit(db_session, repo.repo_id, c1)
            await _seed_commit(db_session, repo.repo_id, c2, [c1])
            for cid in [c1, c2]:
                await _seed_history(db_session, repo.repo_id, cid, [
                    "src/a.py::fn_a", "src/b.py::fn_b",
                ])
            await db_session.commit()
            await _run_provider(db_session, repo.repo_id, c2)
        pairs_r1 = await _fetch_pairs(db_session, r1.repo_id)
        pairs_r2 = await _fetch_pairs(db_session, r2.repo_id)
        assert len(pairs_r1) == 1
        assert len(pairs_r2) == 1
        assert pairs_r1[0].repo_id == r1.repo_id
        assert pairs_r2[0].repo_id == r2.repo_id

    @pytest.mark.asyncio
    async def test_ET_51_delete_old_provider_run_on_rerun(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Rerun for a different ref purges all previous rows for the repo."""
        c1, c2, c3 = _cid(), _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        await _seed_commit(db_session, repo.repo_id, c3, [c2])
        for cid in [c1, c2, c3]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/a.py::fn_a", "src/b.py::fn_b",
            ])
        await db_session.commit()
        # Analyse at c2 first, then at c3; only rows from the c3 run may remain.
        await _run_provider(db_session, repo.repo_id, c2)
        await _run_provider(db_session, repo.repo_id, c3)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # All stored rows must point to the latest ref
        for p in pairs:
            assert p.ref == c3

    @pytest.mark.asyncio
    async def test_ET_52_unicode_in_address_handled(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Unicode characters in addresses do not crash the provider."""
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [
                "src/música.py::canción",
                "src/b.py::fn_b",
            ])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Smoke check only: the stored content is not inspected.
        assert isinstance(pairs, list)

    @pytest.mark.asyncio
    async def test_ET_53_long_address_does_not_exceed_column_width(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Long addresses that still fit the column are stored as one pair.

        The test deliberately stays below the column limit: it exercises
        realistic long addresses (106 characters each), not the over-limit
        rejection path.
        """
        # Each address is "src/" + 96 filler characters + a 6-character symbol
        # suffix = 106 characters. An earlier revision of this test described
        # the symbol columns as VARCHAR(512) — NOTE(review): confirm against
        # the model; either way 106 is comfortably inside that limit.
        addr_a = "src/" + "a" * 96 + "::fn_a"
        addr_b = "src/" + "b" * 96 + "::fn_b"
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [addr_a, addr_b])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        assert len(pairs) == 1

    @pytest.mark.asyncio
    async def test_ET_54_newline_in_address_stored_verbatim(
        self, db_session: AsyncSession, repo: MusehubRepo
    ) -> None:
        """Newline characters in addresses don't trigger injections or errors."""
        addr_a = "src/a.py::fn\n_a"
        addr_b = "src/b.py::fn_b"
        c1, c2 = _cid(), _cid()
        await _seed_commit(db_session, repo.repo_id, c1)
        await _seed_commit(db_session, repo.repo_id, c2, [c1])
        for cid in [c1, c2]:
            await _seed_history(db_session, repo.repo_id, cid, [addr_a, addr_b])
        await db_session.commit()
        await _run_provider(db_session, repo.repo_id, c2)
        pairs = await _fetch_pairs(db_session, repo.repo_id)
        # Smoke check only: the stored content is not inspected.
        assert isinstance(pairs, list)

File History 1 commit

sha256:ef10830ce231e0a20efcb0e2586cb879471247e916616e6fdd0d51df459e2595 fix: typing audit — 0 violations, 0 untyped defs across all… Sonnet 4.6 minor ⚠ 20 days ago

patch test_entangle_provider.py 20 days ago

replace MusehubCommit 20 days ago

replace MusehubCommitRef 20 days ago

insert MusehubRepo 20 days ago

replace repo 20 days ago

replace TestEntangleE2E 20 days ago

replace TestEntangleE2E.test_ET_19_three_symbol_pair_ranking 20 days ago

replace TestEntangleE2E.test_ET_20_a_in_test_flag_set_for_test_files 20 days ago

replace TestEntangleE2E.test_ET_21_result_metadata_keys 20 days ago

replace TestEntangleE2E.test_ET_22_ref_stored_on_pair_row 20 days ago

replace TestEntangleE2E.test_ET_23_multiple_disconnected_pairs 20 days ago

replace TestEntangleE2E.test_ET_24_structurally_linked_defaults_false 20 days ago

replace TestEntangleE2E.test_ET_25_same_file_false_on_stored_pair 20 days ago

replace TestEntangleIntegration 20 days ago

replace TestEntangleIntegration.test_ET_09_empty_repo_returns_empty 20 days ago

replace TestEntangleIntegration.test_ET_10_no_history_entries_returns_empty 20 days ago

replace TestEntangleIntegration.test_ET_11_two_symbols_in_one_commit_no_pair 20 days ago

replace TestEntangleIntegration.test_ET_12_two_co_changes_produces_one_pair 20 days ago

replace TestEntangleIntegration.test_ET_13_import_symbols_excluded 20 days ago

replace TestEntangleIntegration.test_ET_14_bare_path_addresses_excluded 20 days ago

replace TestEntangleIntegration.test_ET_15_same_file_pair_excluded 20 days ago

replace TestEntangleIntegration.test_ET_16_pair_stored_canonical_a_lt_b 20 days ago

replace TestEntangleIntegration.test_ET_17_file_a_b_populated 20 days ago

replace TestEntangleIntegration.test_ET_18_commits_both_active_is_min 20 days ago

replace TestEntangleIntegrity 20 days ago

replace TestEntangleIntegrity.test_ET_37_address_with_only_import_produces_no_pair 20 days ago

replace TestEntangleIntegrity.test_ET_38_mixed_valid_and_import_symbols 20 days ago

replace TestEntangleIntegrity.test_ET_39_unknown_ref_in_bfs_returns_empty 20 days ago

replace TestEntangleIntegrity.test_ET_40_co_changes_count_exact 20 days ago

replace TestEntangleIntegrity.test_ET_41_rate_capped_at_one 20 days ago

replace TestEntanglePerformance 20 days ago

replace TestEntanglePerformance.test_ET_42_ten_commits_ten_symbols_under_500ms 20 days ago

replace TestEntanglePerformance.test_ET_43_100_commits_20_symbols_under_2s 20 days ago

replace TestEntanglePerformance.test_ET_44_empty_repo_under_50ms 20 days ago

replace TestEntanglePerformance.test_ET_45_rerun_same_speed_as_first 20 days ago

replace TestEntanglePerformance.test_ET_46_point_lookup_fast 20 days ago

replace TestEntangleSecurity 20 days ago

replace TestEntangleSecurity.test_ET_47_sql_injection_in_address_stored_verbatim 20 days ago

replace TestEntangleSecurity.test_ET_48_xss_payload_in_address_stored_safely 20 days ago

replace TestEntangleSecurity.test_ET_49_repo_isolation_strict 20 days ago

replace TestEntangleSecurity.test_ET_50_repo_isolation_no_cross_contamination 20 days ago

replace TestEntangleSecurity.test_ET_51_delete_old_provider_run_on_rerun 20 days ago

replace TestEntangleSecurity.test_ET_52_unicode_in_address_handled 20 days ago

replace TestEntangleSecurity.test_ET_53_long_address_does_not_exceed_column_width 20 days ago

replace TestEntangleSecurity.test_ET_54_newline_in_address_stored_verbatim 20 days ago

replace TestEntangleState 20 days ago

replace TestEntangleState.test_ET_31_idempotent_rerun_same_rows 20 days ago

replace TestEntangleState.test_ET_32_stale_rows_purged_on_rerun 20 days ago

replace TestEntangleState.test_ET_33_incremental_new_pair_appears 20 days ago

replace TestEntangleState.test_ET_34_no_duplicate_pairs 20 days ago

Pathtests/test_entangle_provider.py

Lines1,199

Size52.0 KB

LangPython

Refsha256:3ff9c9863a9891bdcde71b4a43228f66d0493e38b7cc1d09fe9eb7de774046b2

Object ID

sha256:89484a2da5a1a6d2780f39bbda2968b44dd675c0b8797309953e238d0c11ce1a…

Last commit

sha256:ef10830ce231e0a20efcb0e2586cb879471247e916616e6fdd0d51df459e2595

fix: typing audit — 0 violations, 0 untyped defs …

20 days ago

Quick links

Blame History