gabriel / musehub public
test_intel_normalized_schema.py python
841 lines 36.9 KB
Raw
sha256:0997d6250ae6476362f6fe2025af7789f46d03df3e9f34356d5e8ee79b201923 fix(issues): use issue number as pagination cursor, not cre… Sonnet 4.6 patch 7 days ago
"""TDD spec for normalized symbol intel schema — SI1–SI40.

The current architecture stores per-symbol data as unbounded JSON blobs in
musehub_intel_results.data_json. Every symbol page load deserializes
megabytes of JSON to return one entry. This test file defines the
correct normalized architecture.

All tests are RED until the implementation is complete.

New tables (replacing the code.symbol_history / code.per_symbol_intel /
code.hash_occurrence blobs):

    musehub_symbol_history_entries  — one row per (repo_id, address, commit_id)
    musehub_symbol_intel            — one row per (repo_id, address)
    musehub_hash_occurrence_entries — one row per (content_id, repo_id, address)

musehub_intel_results keeps only:

    code.intel_summary  — small scalar aggregate, fine as blob
    code.intel_snapshot — computed panel data, fine as blob

Layers:
    1. Schema      — ORM model shape, column types, indexes, constraints
    2. Write       — build_symbol_index upserts normalized rows
    3. Read        — helpers return correct data from normalized tables
    4. Incremental — second push merges without duplication
    5. Integrity   — corrupt data, unknown refs, empty repos
    6. Performance — point lookups within a 50 ms budget; no full-table deserialize
    7. Stress      — 500 symbols across 10 commits; one symbol modified 50 times
    8. Aggregates  — intel_summary and intel_snapshot still produced as blobs
"""
30 from __future__ import annotations
31
32 import json
33 import secrets
34 import time
35 from datetime import datetime, timezone
36
37 import pytest
38 from sqlalchemy import select, func
39 from sqlalchemy.ext.asyncio import AsyncSession
40
41 from muse.core.types import long_id
42 from tests.factories import create_repo
43 from musehub.types.json_types import JSONObject
44
45
46 # ─────────────────────────────────────────────────────────────────────────────
47 # Helpers
48 # ─────────────────────────────────────────────────────────────────────────────
49
def _now() -> datetime:
    """Return the current moment as a timezone-aware UTC ``datetime``."""
    utc = timezone.utc
    return datetime.now(utc)
52
53
def _uid() -> str:
    """Return a fresh random identifier for use as a commit id.

    The hex encoding of 32 random bytes is passed through ``long_id``
    (imported from ``muse.core.types``; its exact output format is not
    visible from this file).
    """
    random_hex = secrets.token_hex(32)
    return long_id(random_hex)
56
57
def _cid() -> str:
    """Return a fresh random identifier for use as a content id.

    Built the same way as ``_uid``: the hex encoding of 32 random bytes,
    passed through ``long_id``.
    """
    random_hex = secrets.token_hex(32)
    return long_id(random_hex)
60
61
def _insert_op(address: str, content_id: str | None = None) -> JSONObject:
    """Build an ``insert`` delta op for the symbol at *address*.

    Args:
        address: Symbol address, e.g. ``"src/main.py::parse"``.
        content_id: Content id to record; when falsy (``None`` or empty)
            a random one is generated with ``_cid()``.

    Returns:
        A dict with the keys ``address``, ``op`` and ``content_id``.
    """
    op: JSONObject = {"address": address, "op": "insert"}
    op["content_id"] = content_id if content_id else _cid()
    return op
68
69
def _patch_op(file_addr: str, children: list[JSONObject]) -> JSONObject:
    """Build a ``patch`` delta op for *file_addr* wrapping *children*.

    Args:
        file_addr: Address of the patched file.
        children: Child ops, stored as-is (not copied) under ``child_ops``.

    Returns:
        A dict with the keys ``address``, ``op`` and ``child_ops``.
    """
    return dict(address=file_addr, op="patch", child_ops=children)
72
73
async def _commit_with_delta(
    session: AsyncSession,
    repo_id: str,
    commit_id: str,
    ops: list[JSONObject],
    parent_ids: list[str] | None = None,
    author: str = "gabriel",
) -> None:
    """Stage a commit whose structured delta is *ops* and link it to a repo.

    Adds a ``MusehubCommit`` (branch ``"main"``, message ``"test commit"``,
    timestamp from ``_now()``) with ``structured_delta={"ops": ops}``, adds a
    ``MusehubCommitRef`` tying *commit_id* to *repo_id*, then flushes the
    session. The session is flushed only; no commit is issued here.

    Args:
        session: Async session the rows are added to.
        repo_id: Repository the commit ref points at.
        commit_id: Id given to the new commit.
        ops: Delta ops stored under the ``"ops"`` key.
        parent_ids: Parent commit ids; a falsy value is stored as ``[]``.
        author: Author recorded on the commit.
    """
    from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef

    parents = parent_ids if parent_ids else []
    commit_row = MusehubCommit(
        commit_id=commit_id,
        branch="main",
        parent_ids=parents,
        message="test commit",
        author=author,
        timestamp=_now(),
        structured_delta={"ops": ops},
    )
    session.add(commit_row)

    ref_row = MusehubCommitRef(repo_id=repo_id, commit_id=commit_id)
    session.add(ref_row)

    await session.flush()
95
96
async def _build_and_persist(
    session: AsyncSession,
    repo_id: str,
    commit_id: str,
) -> list[tuple[str, dict]]:
    """Run ``build_symbol_index`` and persist whatever it produced.

    ``persist_intel_results`` is called only when the build returned a
    non-empty result; an empty result is returned untouched.

    Returns:
        The value returned by ``build_symbol_index``.
    """
    from musehub.services.musehub_symbol_indexer import build_symbol_index
    from musehub.services.musehub_intel_providers import persist_intel_results

    built = await build_symbol_index(session, repo_id, commit_id)
    if not built:
        # Nothing to persist.
        return built
    await persist_intel_results(session, repo_id, commit_id, built)
    return built
108
109
110 # ─────────────────────────────────────────────────────────────────────────────
111 # Layer 1 — Schema: ORM model shape
112 # ─────────────────────────────────────────────────────────────────────────────
113
class TestNormalizedSchemaModels:
    """SI1–SI8: the ORM models for the three normalized tables are importable
    and expose the expected table names, columns, and primary keys.

    Every test imports its model inside the test body.
    """

    def test_SI1_symbol_history_entry_model_importable(self) -> None:
        """The history-entry model maps to ``musehub_symbol_history_entries``."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        table_name = MusehubSymbolHistoryEntry.__tablename__
        assert table_name == "musehub_symbol_history_entries"

    def test_SI2_symbol_history_entry_columns(self) -> None:
        """The history-entry table has at least the seven expected columns."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        present = {col.name for col in MusehubSymbolHistoryEntry.__table__.columns}
        required = {
            "repo_id", "address", "commit_id", "committed_at",
            "author", "op", "content_id",
        }
        assert required <= present

    def test_SI3_symbol_history_entry_pk(self) -> None:
        """The history-entry primary key is (repo_id, address, commit_id)."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        key_names = {col.name for col in MusehubSymbolHistoryEntry.__table__.primary_key}
        assert key_names == {"repo_id", "address", "commit_id"}

    def test_SI4_symbol_intel_model_importable(self) -> None:
        """The symbol-intel model maps to ``musehub_symbol_intel``."""
        from musehub.db.musehub_intel_models import MusehubSymbolIntel

        table_name = MusehubSymbolIntel.__tablename__
        assert table_name == "musehub_symbol_intel"

    def test_SI5_symbol_intel_columns(self) -> None:
        """The symbol-intel table has at least the expected metric columns."""
        from musehub.db.musehub_intel_models import MusehubSymbolIntel

        present = {col.name for col in MusehubSymbolIntel.__table__.columns}
        required = {
            "repo_id", "address", "churn", "churn_30d", "churn_90d",
            "blast", "blast_direct", "blast_cross", "blast_top",
            "last_changed", "last_author", "author_count",
            "gravity", "weekly",
        }
        assert required <= present

    def test_SI6_symbol_intel_pk(self) -> None:
        """The symbol-intel primary key is (repo_id, address)."""
        from musehub.db.musehub_intel_models import MusehubSymbolIntel

        key_names = {col.name for col in MusehubSymbolIntel.__table__.primary_key}
        assert key_names == {"repo_id", "address"}

    def test_SI7_hash_occurrence_entry_model_importable(self) -> None:
        """The hash-occurrence model maps to ``musehub_hash_occurrence_entries``."""
        from musehub.db.musehub_intel_models import MusehubHashOccurrenceEntry

        table_name = MusehubHashOccurrenceEntry.__tablename__
        assert table_name == "musehub_hash_occurrence_entries"

    def test_SI8_hash_occurrence_entry_pk(self) -> None:
        """The hash-occurrence primary key is (content_id, repo_id, address)."""
        from musehub.db.musehub_intel_models import MusehubHashOccurrenceEntry

        key_names = {col.name for col in MusehubHashOccurrenceEntry.__table__.primary_key}
        assert key_names == {"content_id", "repo_id", "address"}
158
159
160 # ─────────────────────────────────────────────────────────────────────────────
161 # Layer 2 — Write: build_symbol_index upserts normalized rows
162 # ─────────────────────────────────────────────────────────────────────────────
163
class TestBuildWritesNormalizedRows:
    """SI9–SI16: build_symbol_index followed by persist_intel_results fills
    the normalized tables; only the aggregate types stay in intel_results."""

    @pytest.mark.asyncio
    async def test_SI9_single_commit_writes_history_entry_rows(
        self, db_session: AsyncSession,
    ) -> None:
        """Both symbols inserted by one commit get a history-entry row."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/main.py::parse"),
            _insert_op("src/main.py::render"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id
        )
        outcome = await db_session.execute(stmt)
        seen = {entry.address for entry in outcome.scalars().all()}
        assert "src/main.py::parse" in seen
        assert "src/main.py::render" in seen

    @pytest.mark.asyncio
    async def test_SI10_history_entry_commit_id_stored(
        self, db_session: AsyncSession,
    ) -> None:
        """The single history row carries the commit id and an insert-like op."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/auth.py::login"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
            MusehubSymbolHistoryEntry.address == "src/auth.py::login",
        )
        outcome = await db_session.execute(stmt)
        entry = outcome.scalar_one()  # exactly one row is expected
        assert entry.commit_id == head
        assert entry.op in ("add", "insert", "modify")

    @pytest.mark.asyncio
    async def test_SI11_single_commit_writes_symbol_intel_rows(
        self, db_session: AsyncSession,
    ) -> None:
        """An inserted symbol gets a symbol-intel row with churn of at least 1."""
        from musehub.db.musehub_intel_models import MusehubSymbolIntel

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/core.py::Engine"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = select(MusehubSymbolIntel).where(
            MusehubSymbolIntel.repo_id == repo.repo_id,
            MusehubSymbolIntel.address == "src/core.py::Engine",
        )
        outcome = await db_session.execute(stmt)
        intel_row = outcome.scalar_one_or_none()
        assert intel_row is not None
        assert intel_row.churn >= 1

    @pytest.mark.asyncio
    async def test_SI12_hash_occurrence_rows_written(
        self, db_session: AsyncSession,
    ) -> None:
        """Two symbols sharing a content id both appear as occurrence rows."""
        from musehub.db.musehub_intel_models import MusehubHashOccurrenceEntry

        repo = await create_repo(db_session)
        head = _uid()
        shared_content_id = _cid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/a.py::foo", shared_content_id),
            _insert_op("src/b.py::bar", shared_content_id),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = select(MusehubHashOccurrenceEntry).where(
            MusehubHashOccurrenceEntry.repo_id == repo.repo_id,
            MusehubHashOccurrenceEntry.content_id == shared_content_id,
        )
        outcome = await db_session.execute(stmt)
        seen = {entry.address for entry in outcome.scalars().all()}
        assert "src/a.py::foo" in seen
        assert "src/b.py::bar" in seen

    @pytest.mark.asyncio
    async def test_SI13_intel_summary_still_written_to_intel_results(
        self, db_session: AsyncSession,
    ) -> None:
        """A ``code.intel_summary`` blob exists and contains ``health_score``."""
        from musehub.db.musehub_intel_models import MusehubIntelResult

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/main.py::run"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = select(MusehubIntelResult).where(
            MusehubIntelResult.repo_id == repo.repo_id,
            MusehubIntelResult.intel_type == "code.intel_summary",
        )
        outcome = await db_session.execute(stmt)
        summary_row = outcome.scalar_one_or_none()
        assert summary_row is not None
        payload = json.loads(summary_row.data_json)
        assert "health_score" in payload

    @pytest.mark.asyncio
    async def test_SI14_intel_snapshot_still_written_to_intel_results(
        self, db_session: AsyncSession,
    ) -> None:
        """A ``code.intel_snapshot`` row exists after build and persist."""
        from musehub.db.musehub_intel_models import MusehubIntelResult

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/main.py::run"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = select(MusehubIntelResult).where(
            MusehubIntelResult.repo_id == repo.repo_id,
            MusehubIntelResult.intel_type == "code.intel_snapshot",
        )
        outcome = await db_session.execute(stmt)
        snapshot_row = outcome.scalar_one_or_none()
        assert snapshot_row is not None

    @pytest.mark.asyncio
    async def test_SI15_blob_types_not_written_to_intel_results(
        self, db_session: AsyncSession,
    ) -> None:
        """code.symbol_history, code.per_symbol_intel and code.hash_occurrence
        must no longer be stored as intel_results blobs."""
        from musehub.db.musehub_intel_models import MusehubIntelResult

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/main.py::run"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        retired_types = [
            "code.symbol_history",
            "code.per_symbol_intel",
            "code.hash_occurrence",
        ]
        stmt = select(MusehubIntelResult.intel_type).where(
            MusehubIntelResult.repo_id == repo.repo_id,
            MusehubIntelResult.intel_type.in_(retired_types),
        )
        outcome = await db_session.execute(stmt)
        blob_types = outcome.scalars().all()
        assert blob_types == [], f"blob types still written: {blob_types}"

    @pytest.mark.asyncio
    async def test_SI16_author_stored_in_history_entry(
        self, db_session: AsyncSession,
    ) -> None:
        """The commit author is copied onto the history-entry row."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(
            db_session,
            repo.repo_id,
            head,
            [_insert_op("src/auth.py::validate")],
            author="gabriel",
        )
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
            MusehubSymbolHistoryEntry.address == "src/auth.py::validate",
        )
        outcome = await db_session.execute(stmt)
        entry = outcome.scalar_one()
        assert entry.author == "gabriel"
337
338
339 # ─────────────────────────────────────────────────────────────────────────────
340 # Layer 3 — Read: helpers return correct data from normalized tables
341 # ─────────────────────────────────────────────────────────────────────────────
342
class TestReadHelpers:
    """SI17–SI24: the read helpers return the data written by a build,
    served from the normalized tables rather than from blobs."""

    @pytest.mark.asyncio
    async def test_SI17_load_symbol_history_returns_entries_for_address(
        self, db_session: AsyncSession,
    ) -> None:
        """Unfiltered history is keyed by every indexed address."""
        from musehub.services.musehub_symbol_indexer import load_symbol_history

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/auth.py::login"),
            _insert_op("src/core.py::Engine"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        history = await load_symbol_history(db_session, repo.repo_id)
        for expected in ("src/auth.py::login", "src/core.py::Engine"):
            assert expected in history

    @pytest.mark.asyncio
    async def test_SI18_load_symbol_history_file_path_filter(
        self, db_session: AsyncSession,
    ) -> None:
        """``file_path`` restricts the history to symbols of that file."""
        from musehub.services.musehub_symbol_indexer import load_symbol_history

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/auth.py::login"),
            _insert_op("src/auth.py::logout"),
            _insert_op("src/core.py::Engine"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        history = await load_symbol_history(
            db_session, repo.repo_id, file_path="src/auth.py",
        )
        assert "src/auth.py::login" in history
        assert "src/auth.py::logout" in history
        assert "src/core.py::Engine" not in history

    @pytest.mark.asyncio
    async def test_SI19_load_symbol_history_empty_when_no_index(
        self, db_session: AsyncSession,
    ) -> None:
        """A repo that was never indexed yields an empty dict."""
        from musehub.services.musehub_symbol_indexer import load_symbol_history

        repo = await create_repo(db_session)
        history = await load_symbol_history(db_session, repo.repo_id)
        assert history == {}

    @pytest.mark.asyncio
    async def test_SI20_lookup_symbol_intel_returns_metrics(
        self, db_session: AsyncSession,
    ) -> None:
        """The looked-up entry exposes churn, gravity and blast."""
        from musehub.services.musehub_symbol_indexer import lookup_symbol_intel

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/billing.py::compute_total"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        found = await lookup_symbol_intel(
            db_session, repo.repo_id, ["src/billing.py::compute_total"],
        )
        assert "src/billing.py::compute_total" in found
        metrics = found["src/billing.py::compute_total"]
        for field in ("churn", "gravity", "blast"):
            assert field in metrics

    @pytest.mark.asyncio
    async def test_SI21_lookup_symbol_intel_missing_address_excluded(
        self, db_session: AsyncSession,
    ) -> None:
        """An address with no intel row is left out of the result."""
        from musehub.services.musehub_symbol_indexer import lookup_symbol_intel

        repo = await create_repo(db_session)
        found = await lookup_symbol_intel(
            db_session, repo.repo_id, ["nonexistent::fn"],
        )
        assert found == {}

    @pytest.mark.asyncio
    async def test_SI22_load_hash_occurrence_returns_clone_pairs(
        self, db_session: AsyncSession,
    ) -> None:
        """A shared content id maps to both addresses that carry it."""
        from musehub.services.musehub_symbol_indexer import load_hash_occurrence

        repo = await create_repo(db_session)
        head = _uid()
        content_id = _cid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/a.py::foo", content_id),
            _insert_op("src/b.py::bar", content_id),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        occurrence = await load_hash_occurrence(db_session, repo.repo_id)
        assert content_id in occurrence
        clones = set(occurrence[content_id])
        assert clones == {"src/a.py::foo", "src/b.py::bar"}

    @pytest.mark.asyncio
    async def test_SI23_load_intel_snapshot_still_works(
        self, db_session: AsyncSession,
    ) -> None:
        """The snapshot loader returns something after a build."""
        from musehub.services.musehub_symbol_indexer import load_intel_snapshot

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/main.py::run"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        snapshot = await load_intel_snapshot(db_session, repo.repo_id)
        assert snapshot is not None

    @pytest.mark.asyncio
    async def test_SI24_get_index_meta_returns_correct_ref(
        self, db_session: AsyncSession,
    ) -> None:
        """Index metadata reports the indexed commit under ``ref``."""
        from musehub.services.musehub_symbol_indexer import get_index_meta

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/main.py::run"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        meta = await get_index_meta(db_session, repo.repo_id)
        assert meta is not None
        assert meta["ref"] == head
463
464
465 # ─────────────────────────────────────────────────────────────────────────────
466 # Layer 4 — Incremental: second push merges without duplication
467 # ─────────────────────────────────────────────────────────────────────────────
468
class TestIncrementalUpdates:
    """SI25–SI29: a second push adds its own rows and never duplicates the
    rows written by the first one."""

    @pytest.mark.asyncio
    async def test_SI25_second_push_adds_new_history_entries(
        self, db_session: AsyncSession,
    ) -> None:
        """Symbols from both the first and the second push are present."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        repo = await create_repo(db_session)

        first = _uid()
        await _commit_with_delta(db_session, repo.repo_id, first, [
            _insert_op("src/auth.py::login"),
        ])
        await _build_and_persist(db_session, repo.repo_id, first)

        second = _uid()
        await _commit_with_delta(
            db_session,
            repo.repo_id,
            second,
            [_insert_op("src/auth.py::logout")],
            parent_ids=[first],
        )
        await _build_and_persist(db_session, repo.repo_id, second)

        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id
        )
        outcome = await db_session.execute(stmt)
        seen = {entry.address for entry in outcome.scalars().all()}
        assert "src/auth.py::login" in seen
        assert "src/auth.py::logout" in seen

    @pytest.mark.asyncio
    async def test_SI26_second_push_does_not_duplicate_existing_entries(
        self, db_session: AsyncSession,
    ) -> None:
        """Building twice at the same head leaves exactly one history row."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        repo = await create_repo(db_session)
        first = _uid()
        await _commit_with_delta(db_session, repo.repo_id, first, [
            _insert_op("src/core.py::Engine"),
        ])
        await _build_and_persist(db_session, repo.repo_id, first)
        # Second build at the unchanged head must not add rows.
        await _build_and_persist(db_session, repo.repo_id, first)

        stmt = (
            select(func.count())
            .select_from(MusehubSymbolHistoryEntry)
            .where(
                MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
                MusehubSymbolHistoryEntry.address == "src/core.py::Engine",
            )
        )
        outcome = await db_session.execute(stmt)
        row_count = outcome.scalar_one()
        assert row_count == 1

    @pytest.mark.asyncio
    async def test_SI27_modify_op_updates_symbol_intel_churn(
        self, db_session: AsyncSession,
    ) -> None:
        """An insert followed by a replace yields a churn of 2."""
        from musehub.services.musehub_symbol_indexer import lookup_symbol_intel

        repo = await create_repo(db_session)

        first = _uid()
        await _commit_with_delta(db_session, repo.repo_id, first, [
            _insert_op("src/core.py::Engine"),
        ])
        await _build_and_persist(db_session, repo.repo_id, first)

        second = _uid()
        replace_op = {
            "address": "src/core.py::Engine",
            "op": "replace",
            "content_id": _cid(),
        }
        await _commit_with_delta(
            db_session, repo.repo_id, second, [replace_op], parent_ids=[first],
        )
        await _build_and_persist(db_session, repo.repo_id, second)

        found = await lookup_symbol_intel(
            db_session, repo.repo_id, ["src/core.py::Engine"],
        )
        assert found["src/core.py::Engine"]["churn"] == 2

    @pytest.mark.asyncio
    async def test_SI28_second_push_history_has_both_commit_ids(
        self, db_session: AsyncSession,
    ) -> None:
        """The symbol's history rows reference both commits that touched it."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        repo = await create_repo(db_session)
        first, second = _uid(), _uid()

        await _commit_with_delta(db_session, repo.repo_id, first, [
            _insert_op("src/auth.py::login"),
        ])
        await _build_and_persist(db_session, repo.repo_id, first)

        replace_op = {
            "address": "src/auth.py::login",
            "op": "replace",
            "content_id": _cid(),
        }
        await _commit_with_delta(
            db_session, repo.repo_id, second, [replace_op], parent_ids=[first],
        )
        await _build_and_persist(db_session, repo.repo_id, second)

        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
            MusehubSymbolHistoryEntry.address == "src/auth.py::login",
        )
        outcome = await db_session.execute(stmt)
        recorded = {entry.commit_id for entry in outcome.scalars().all()}
        assert first in recorded
        assert second in recorded

    @pytest.mark.asyncio
    async def test_SI29_intel_summary_ref_advances_after_second_push(
        self, db_session: AsyncSession,
    ) -> None:
        """After the second build the index metadata points at the new head."""
        from musehub.services.musehub_symbol_indexer import get_index_meta

        repo = await create_repo(db_session)
        first, second = _uid(), _uid()

        await _commit_with_delta(
            db_session, repo.repo_id, first, [_insert_op("src/a.py::f")],
        )
        await _build_and_persist(db_session, repo.repo_id, first)

        await _commit_with_delta(
            db_session,
            repo.repo_id,
            second,
            [_insert_op("src/b.py::g")],
            parent_ids=[first],
        )
        await _build_and_persist(db_session, repo.repo_id, second)

        meta = await get_index_meta(db_session, repo.repo_id)
        assert meta is not None
        assert meta["ref"] == second
584
585
586 # ─────────────────────────────────────────────────────────────────────────────
587 # Layer 5 — Integrity: corrupt data, unknown refs, empty repos
588 # ─────────────────────────────────────────────────────────────────────────────
589
class TestDataIntegrity:
    """SI30–SI33: edge cases that must neither raise nor leave stray rows."""

    @pytest.mark.asyncio
    async def test_SI30_empty_repo_returns_empty_history(
        self, db_session: AsyncSession,
    ) -> None:
        """A repo without commits has an empty symbol history."""
        from musehub.services.musehub_symbol_indexer import load_symbol_history

        repo = await create_repo(db_session)
        history = await load_symbol_history(db_session, repo.repo_id)
        assert history == {}

    @pytest.mark.asyncio
    async def test_SI31_unknown_head_commit_returns_empty_results(
        self, db_session: AsyncSession,
    ) -> None:
        """Building at a commit id that was never stored returns ``[]``."""
        from musehub.services.musehub_symbol_indexer import build_symbol_index

        repo = await create_repo(db_session)
        unknown_head = _uid()
        built = await build_symbol_index(db_session, repo.repo_id, unknown_head)
        assert built == []

    @pytest.mark.asyncio
    async def test_SI32_commit_with_no_structured_delta_skipped(
        self, db_session: AsyncSession,
    ) -> None:
        """A commit created without ``structured_delta`` yields no history rows."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry
        from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef

        repo = await create_repo(db_session)
        head = _uid()
        # Built by hand because _commit_with_delta always sets structured_delta.
        bare_commit = MusehubCommit(
            commit_id=head,
            branch="main",
            parent_ids=[],
            message="no delta",
            author="gabriel",
            timestamp=_now(),
        )
        db_session.add(bare_commit)
        db_session.add(MusehubCommitRef(repo_id=repo.repo_id, commit_id=head))
        await db_session.flush()
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = (
            select(func.count())
            .select_from(MusehubSymbolHistoryEntry)
            .where(MusehubSymbolHistoryEntry.repo_id == repo.repo_id)
        )
        outcome = await db_session.execute(stmt)
        row_count = outcome.scalar_one()
        assert row_count == 0

    @pytest.mark.asyncio
    async def test_SI33_lookup_symbol_intel_empty_address_list(
        self, db_session: AsyncSession,
    ) -> None:
        """Looking up an empty address list returns an empty dict."""
        from musehub.services.musehub_symbol_indexer import lookup_symbol_intel

        repo = await create_repo(db_session)
        found = await lookup_symbol_intel(db_session, repo.repo_id, [])
        assert found == {}
643
644
645 # ─────────────────────────────────────────────────────────────────────────────
646 # Layer 6 — Performance: point lookups do not deserialize blobs
647 # ─────────────────────────────────────────────────────────────────────────────
648
class TestPerformance:
    """SI34–SI36: reads from the normalized tables stay within a wall-clock
    budget on a 50-commit repo.

    SI34 and SI35 time only the read call itself with ``time.perf_counter``;
    SI36 has no timing budget and only checks completeness.
    """

    @pytest.mark.asyncio
    async def test_SI34_single_symbol_lookup_under_50ms(
        self, db_session: AsyncSession,
    ) -> None:
        """Looking up one symbol out of 100 indexed ones takes under 50 ms."""
        from musehub.services.musehub_symbol_indexer import lookup_symbol_intel

        repo = await create_repo(db_session)
        # Linear chain of 50 commits, each adding two symbols in its own file.
        parent: list[str] = []
        head = ""
        for i in range(50):
            head = _uid()
            ops = [
                _insert_op(f"src/file_{i}.py::fn_{i}"),
                _insert_op(f"src/file_{i}.py::helper_{i}"),
            ]
            await _commit_with_delta(
                db_session, repo.repo_id, head, ops, parent_ids=parent,
            )
            parent = [head]
        await _build_and_persist(db_session, repo.repo_id, head)

        target = "src/file_25.py::fn_25"
        started = time.perf_counter()
        found = await lookup_symbol_intel(db_session, repo.repo_id, [target])
        elapsed_ms = (time.perf_counter() - started) * 1000
        assert target in found
        assert elapsed_ms < 50, f"point lookup took {elapsed_ms:.1f}ms — too slow"

    @pytest.mark.asyncio
    async def test_SI35_file_scoped_history_lookup_under_50ms(
        self, db_session: AsyncSession,
    ) -> None:
        """File-scoped history for a symbol touched by all 50 commits takes
        under 50 ms."""
        from musehub.services.musehub_symbol_indexer import load_symbol_history

        repo = await create_repo(db_session)
        # Every commit touches the hot symbol plus one unrelated file.
        parent: list[str] = []
        head = ""
        for i in range(50):
            head = _uid()
            ops = [
                _insert_op(f"src/other_{i}.py::fn"),
                _insert_op("src/target.py::hot_fn"),
            ]
            await _commit_with_delta(
                db_session, repo.repo_id, head, ops, parent_ids=parent,
            )
            parent = [head]
        await _build_and_persist(db_session, repo.repo_id, head)

        started = time.perf_counter()
        history = await load_symbol_history(
            db_session, repo.repo_id, file_path="src/target.py",
        )
        elapsed_ms = (time.perf_counter() - started) * 1000
        assert "src/target.py::hot_fn" in history
        assert elapsed_ms < 50, f"file-scoped lookup took {elapsed_ms:.1f}ms"

    @pytest.mark.asyncio
    async def test_SI36_load_symbol_history_no_file_filter_returns_all(
        self, db_session: AsyncSession,
    ) -> None:
        """Without a file filter, symbols from every file are returned."""
        from musehub.services.musehub_symbol_indexer import load_symbol_history

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/a.py::fn_a"),
            _insert_op("src/b.py::fn_b"),
            _insert_op("src/c.py::fn_c"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        history = await load_symbol_history(db_session, repo.repo_id)
        expected = {"src/a.py::fn_a", "src/b.py::fn_b", "src/c.py::fn_c"}
        assert expected <= set(history.keys())
718
719
720 # ─────────────────────────────────────────────────────────────────────────────
721 # Layer 7 — Stress: 500 symbols × realistic commit volume
722 # ─────────────────────────────────────────────────────────────────────────────
723
class TestStress:
    """SI37–SI38: larger histories index completely and within the time
    budget."""

    @pytest.mark.asyncio
    async def test_SI37_index_500_symbols_across_10_commits(
        self, db_session: AsyncSession,
    ) -> None:
        """500 symbols spread over 10 chained commits produce 500 history
        rows, and the single build at the head finishes in under 10 s."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry

        repo = await create_repo(db_session)
        symbols = [f"src/module_{i // 10}.py::fn_{i}" for i in range(500)]
        chunk = len(symbols) // 10

        parent: list[str] = []
        head = ""
        for batch_idx in range(10):
            head = _uid()
            lo = batch_idx * chunk
            hi = (batch_idx + 1) * chunk
            ops = [_insert_op(symbol) for symbol in symbols[lo:hi]]
            await _commit_with_delta(
                db_session, repo.repo_id, head, ops, parent_ids=parent,
            )
            parent = [head]

        # Only the build at the final head is timed.
        started = time.perf_counter()
        await _build_and_persist(db_session, repo.repo_id, head)
        elapsed = time.perf_counter() - started

        stmt = (
            select(func.count())
            .select_from(MusehubSymbolHistoryEntry)
            .where(MusehubSymbolHistoryEntry.repo_id == repo.repo_id)
        )
        outcome = await db_session.execute(stmt)
        row_count = outcome.scalar_one()
        assert row_count == 500
        assert elapsed < 10.0, f"500-symbol index took {elapsed:.1f}s"

    @pytest.mark.asyncio
    async def test_SI38_same_symbol_modified_50_times(
        self, db_session: AsyncSession,
    ) -> None:
        """One symbol touched by 50 chained commits has 50 history rows and a
        churn of 50."""
        from musehub.db.musehub_intel_models import MusehubSymbolHistoryEntry
        from musehub.services.musehub_symbol_indexer import lookup_symbol_intel

        repo = await create_repo(db_session)
        addr = "src/hot.py::hot_fn"

        parent: list[str] = []
        head = ""
        for i in range(50):
            head = _uid()
            # The first commit inserts the symbol; every later one replaces it.
            if i == 0:
                op_type = "insert"
            else:
                op_type = "replace"
            touch = {"address": addr, "op": op_type, "content_id": _cid()}
            await _commit_with_delta(
                db_session, repo.repo_id, head, [touch], parent_ids=parent,
            )
            parent = [head]
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = select(MusehubSymbolHistoryEntry).where(
            MusehubSymbolHistoryEntry.repo_id == repo.repo_id,
            MusehubSymbolHistoryEntry.address == addr,
        )
        outcome = await db_session.execute(stmt)
        entries = outcome.scalars().all()
        assert len(entries) == 50

        found = await lookup_symbol_intel(db_session, repo.repo_id, [addr])
        assert found[addr]["churn"] == 50
786
787
788 # ─────────────────────────────────────────────────────────────────────────────
789 # Layer 8 — Aggregates: intel_summary and intel_snapshot still produced
790 # ─────────────────────────────────────────────────────────────────────────────
791
class TestAggregatesStillWork:
    """SI39–SI40: the ``code.intel_summary`` aggregate is still written to
    intel_results and reflects the indexed symbols."""

    @pytest.mark.asyncio
    async def test_SI39_intel_summary_fields_correct(
        self, db_session: AsyncSession,
    ) -> None:
        """The summary counts at least the three inserted symbols and carries
        ``health_score`` and ``health_label``."""
        from musehub.db.musehub_intel_models import MusehubIntelResult

        repo = await create_repo(db_session)
        head = _uid()
        await _commit_with_delta(db_session, repo.repo_id, head, [
            _insert_op("src/a.py::fn1"),
            _insert_op("src/b.py::fn2"),
            _insert_op("src/c.py::fn3"),
        ])
        await _build_and_persist(db_session, repo.repo_id, head)

        stmt = select(MusehubIntelResult).where(
            MusehubIntelResult.repo_id == repo.repo_id,
            MusehubIntelResult.intel_type == "code.intel_summary",
        )
        outcome = await db_session.execute(stmt)
        summary_row = outcome.scalar_one()
        payload = json.loads(summary_row.data_json)
        assert payload.get("symbol_count", 0) >= 3
        assert "health_score" in payload
        assert "health_label" in payload

    @pytest.mark.asyncio
    async def test_SI40_rebuild_updates_summary_symbol_count(
        self, db_session: AsyncSession,
    ) -> None:
        """After a second push the single summary row counts at least three
        symbols."""
        from musehub.db.musehub_intel_models import MusehubIntelResult

        repo = await create_repo(db_session)
        first, second = _uid(), _uid()

        await _commit_with_delta(
            db_session, repo.repo_id, first, [_insert_op("src/a.py::fn1")],
        )
        await _build_and_persist(db_session, repo.repo_id, first)

        await _commit_with_delta(
            db_session,
            repo.repo_id,
            second,
            [
                _insert_op("src/b.py::fn2"),
                _insert_op("src/c.py::fn3"),
            ],
            parent_ids=[first],
        )
        await _build_and_persist(db_session, repo.repo_id, second)

        stmt = select(MusehubIntelResult).where(
            MusehubIntelResult.repo_id == repo.repo_id,
            MusehubIntelResult.intel_type == "code.intel_summary",
        )
        outcome = await db_session.execute(stmt)
        summary_row = outcome.scalar_one()  # raises unless exactly one row
        payload = json.loads(summary_row.data_json)
        assert payload.get("symbol_count", 0) >= 3
File History 1 commit
sha256:0997d6250ae6476362f6fe2025af7789f46d03df3e9f34356d5e8ee79b201923 fix(issues): use issue number as pagination cursor, not cre… Sonnet 4.6 patch 7 days ago