test_search.py
python
sha256:3ff9c9863a9891bdcde71b4a43228f66d0493e38b7cc1d09fe9eb7de774046b2
feat: add repair-commit wire endpoint (API parity with repa…
Opus 4.8
minor
⚠ breaking
1 day ago
| 1 | """Section 17 — Search: 7-layer test suite. |
| 2 | |
| 3 | Covers gaps in the existing 26 search tests: |
| 4 | |
| 5 | Layer 1 Unit: |
| 6 | - _tokenize returns lowercase tokens, ignores punctuation |
| 7 | - _tokenize empty string → empty set |
| 8 | - _overlap_score full match → 1.0 |
| 9 | - _overlap_score partial match |
| 10 | - _overlap_score no match → 0.0 |
| 11 | - _overlap_score empty query → 0.0 |
| 12 | - _commit_to_match round-trips all fields, rounds score to 4dp |
| 13 | - _DEFAULT_LIMIT == 20 |
| 14 | - _STOP_WORDS excludes common words |
| 15 | |
| 16 | Layer 2 Integration: |
| 17 | - search_by_keyword with matching commit |
| 18 | - search_by_keyword no match → empty matches |
| 19 | - search_by_keyword threshold filters low-scoring commits |
| 20 | - search_by_ask strips stop-words before scoring |
| 21 | - search_by_ask no useful tokens → all commits included |
| 22 | - search_by_pattern message match preferred over branch match |
| 23 | - search_by_pattern case-insensitive |
| 24 | - search_by_property returns empty matches (stub) |
| 25 | - _fetch_candidates respects since/until filters |
| 26 | - _fetch_candidates caps at 5000 |
| 27 | |
| 28 | Layer 3 E2E: |
| 29 | - GET /api/search?q=foo returns GlobalSearchResult JSON |
| 30 | - GET /api/search missing q → 422 |
| 31 | - GET /api/repos/{repo_id}/search?q=foo&mode=keyword → SearchResponse |
| 32 | - GET /api/repos/{repo_id}/search invalid mode → 422 |
| 33 | - GET /api/repos/{repo_id}/search unknown repo → 404 |
| 34 | - GET /api/repos/{repo_id}/search private repo no auth → 401 |
| 35 | |
| 36 | Layer 4 Stress: |
| 37 | - 200 commits, keyword search returns at most limit=10 |
| 38 | - 5 back-to-back search_by_keyword calls on one session, all succeed |
| 39 | |
| 40 | Layer 5 Data Integrity: |
| 41 | - keyword match_source == "message", score in [0,1] |
| 42 | - pattern message match_source == "message" |
| 43 | - pattern branch match_source == "branch" |
| 44 | - ask mode scores commits with matching tokens higher |
| 45 | |
| 46 | Layer 6 Security: |
| 47 | - SQL injection pattern in q param handled safely |
| 48 | - XSS probe in q echoed as plain text in JSON (not rendered) |
| 49 | - Very long query (10k chars) doesn't crash |
| 50 | - null bytes in query handled gracefully |
| 51 | |
| 52 | Layer 7 Performance: |
| 53 | - 1000x _tokenize calls in <100ms |
| 54 | - search_by_keyword over 500 commits completes in <500ms |
| 55 | """ |
| 56 | from __future__ import annotations |
| 57 | |
| 58 | import asyncio |
| 59 | import secrets |
| 60 | import time |
| 61 | from datetime import datetime, timezone, timedelta |
| 62 | |
| 63 | import pytest |
| 64 | from httpx import AsyncClient |
| 65 | from sqlalchemy.ext.asyncio import AsyncSession |
| 66 | |
| 67 | from musehub.core.genesis import compute_identity_id, compute_repo_id |
| 68 | from musehub.db.musehub_repo_models import MusehubRepo, MusehubCommit, MusehubCommitRef |
| 69 | from musehub.muse_cli.models import MuseCliCommit, MuseCliSnapshot |
| 70 | from musehub.types.json_types import StrDict |
| 71 | from musehub.services.musehub_search import ( |
| 72 | _tokenize, |
| 73 | _overlap_score, |
| 74 | _commit_to_match, |
| 75 | _DEFAULT_LIMIT, |
| 76 | _STOP_WORDS, |
| 77 | search_by_ask, |
| 78 | search_by_keyword, |
| 79 | search_by_pattern, |
| 80 | search_by_property, |
| 81 | ) |
| 82 | |
| 83 | |
| 84 | # ── Shared helpers ──────────────────────────────────────────────────────────── |
| 85 | |
| 86 | |
| 87 | def _uid() -> str: |
| 88 | return secrets.token_hex(16) |
| 89 | |
| 90 | |
async def _repo(
    session: AsyncSession,
    *,
    visibility: str = "public",
    name: str | None = None,
) -> str:
    """Add a ``MusehubRepo`` owned by ``testuser`` and return its repo id.

    Args:
        session: Session the row is added to and flushed on (not committed).
        visibility: Value stored in the repo's ``visibility`` column.
        name: Repo name; a random ``repo-<8 hex>`` name is used when falsy.

    Returns:
        The new row's ``repo_id`` converted to ``str``.
    """
    if not name:
        name = f"repo-{_uid()[:8]}"
    # The slug is the name truncated to 64 characters.
    repo_slug = name[:64]
    now = datetime.now(tz=timezone.utc)
    owner_identity = compute_identity_id(b"testuser")
    repo_row = MusehubRepo(
        repo_id=compute_repo_id(owner_identity, repo_slug, "code", now.isoformat()),
        name=name,
        owner="testuser",
        slug=repo_slug,
        visibility=visibility,
        owner_user_id=owner_identity,
        created_at=now,
        updated_at=now,
    )
    session.add(repo_row)
    await session.flush()
    return str(repo_row.repo_id)
| 114 | |
| 115 | |
async def _snap(session: AsyncSession) -> str:
    """Add a ``MuseCliSnapshot`` with an empty manifest and return its id."""
    snapshot_id = f"snap-{_uid()[:16]}"
    snapshot_row = MuseCliSnapshot(snapshot_id=snapshot_id, manifest={})
    session.add(snapshot_row)
    await session.flush()
    return snapshot_id
| 121 | |
| 122 | |
async def _commit(
    session: AsyncSession,
    repo_id: str,
    *,
    message: str = "test commit",
    branch: str = "main",
    author: str = "alice",
    committed_at: datetime | None = None,
) -> MuseCliCommit:
    """Add a ``MuseCliCommit`` (backed by a fresh snapshot) and return it.

    Args:
        session: Session the rows are added to and flushed on (not committed).
        repo_id: Repo the commit is attached to.
        message: Commit message.
        branch: Branch name stored on the commit.
        author: Author stored on the commit.
        committed_at: Commit timestamp; the current UTC time is used when falsy.

    Returns:
        The flushed ``MuseCliCommit`` instance.
    """
    snapshot_id = await _snap(session)
    commit_row = MuseCliCommit(
        commit_id=_uid(),
        repo_id=repo_id,
        branch=branch,
        snapshot_id=snapshot_id,
        message=message,
        author=author,
        committed_at=committed_at if committed_at else datetime.now(timezone.utc),
    )
    session.add(commit_row)
    await session.flush()
    return commit_row
| 145 | |
| 146 | |
async def _hub_commit(
    session: AsyncSession,
    repo_id: str,
    *,
    message: str = "hub commit",
    branch: str = "main",
) -> None:
    """Add a ``MusehubCommit`` plus the ``MusehubCommitRef`` tying it to *repo_id*.

    The commit has no parents, is authored by ``testuser`` and is stamped
    with the current UTC time. Both rows are flushed, not committed.
    """
    hub_commit = MusehubCommit(
        commit_id=_uid(),
        branch=branch,
        parent_ids=[],
        message=message,
        author="testuser",
        timestamp=datetime.now(tz=timezone.utc),
    )
    repo_link = MusehubCommitRef(repo_id=repo_id, commit_id=hub_commit.commit_id)
    session.add(hub_commit)
    session.add(repo_link)
    await session.flush()
| 165 | |
| 166 | |
| 167 | # ── Layer 1 — Unit ──────────────────────────────────────────────────────────── |
| 168 | |
| 169 | |
class TestUnitTokenize:
    """Layer 1 unit checks for ``_tokenize``."""

    def test_returns_lowercase_tokens(self) -> None:
        """Mixed-case words come back as a set of lowercase tokens."""
        assert _tokenize("Hello World") == {"hello", "world"}

    def test_ignores_punctuation(self) -> None:
        """Commas, exclamation marks and hyphens separate tokens."""
        tokens = _tokenize("hello, world! foo-bar")
        for expected in ("hello", "world", "foo", "bar"):
            assert expected in tokens

    def test_empty_string_returns_empty_set(self) -> None:
        """An empty input yields an empty set."""
        tokens = _tokenize("")
        assert tokens == set()

    def test_alphanumeric_tokens(self) -> None:
        """Tokens mixing letters and digits are kept whole."""
        tokens = _tokenize("feat123 fix456")
        for expected in ("feat123", "fix456"):
            assert expected in tokens

    def test_deduplicates_tokens(self) -> None:
        """Repeated words collapse to a single token."""
        assert _tokenize("foo foo foo") == {"foo"}
| 193 | |
| 194 | |
class TestUnitOverlapScore:
    """Layer 1 unit checks for ``_overlap_score``."""

    def test_full_match_returns_one(self) -> None:
        """Every query token present in the text scores 1.0."""
        assert _overlap_score({"hello", "world"}, "hello world extra") == 1.0

    def test_partial_match(self) -> None:
        """One of two query tokens present scores 0.5."""
        assert _overlap_score({"hello", "world"}, "hello extra") == 0.5

    def test_no_match_returns_zero(self) -> None:
        """No query token present scores 0.0."""
        assert _overlap_score({"hello"}, "goodbye universe") == 0.0

    def test_empty_query_returns_zero(self) -> None:
        """An empty query token set scores 0.0."""
        assert _overlap_score(set(), "anything goes") == 0.0

    def test_single_token_match(self) -> None:
        """A lone query token found in the text scores 1.0."""
        assert _overlap_score({"jazz"}, "jazz fusion bassline") == 1.0
| 215 | |
| 216 | |
class TestUnitCommitToMatch:
    """Layer 1 unit checks for ``_commit_to_match``."""

    def test_round_trips_all_fields(self) -> None:
        """Commit fields, score and match source are carried onto the match."""
        from musehub.models.musehub import SearchCommitMatch

        commit = MuseCliCommit(
            commit_id="abc123",
            repo_id="repo-1",
            branch="main",
            snapshot_id="snap-1",
            message="add harmony voice",
            author="alice",
            committed_at=datetime(2025, 1, 1, tzinfo=timezone.utc),
        )
        converted = _commit_to_match(commit, score=0.12345678, match_source="message")

        assert isinstance(converted, SearchCommitMatch)
        assert converted.commit_id == "abc123"
        assert converted.branch == "main"
        assert converted.message == "add harmony voice"
        assert converted.author == "alice"
        assert converted.score == round(0.12345678, 4)
        assert converted.match_source == "message"

    def test_default_score_is_one(self) -> None:
        """Omitting ``score`` produces a match scored 1.0."""
        commit = MuseCliCommit(
            commit_id="x",
            repo_id="r",
            branch="b",
            snapshot_id="s",
            message="m",
            author="a",
            committed_at=datetime.now(timezone.utc),
        )
        converted = _commit_to_match(commit)
        assert converted.score == 1.0

    def test_score_rounded_to_4dp(self) -> None:
        """A repeating decimal score is rounded to four decimal places."""
        commit = MuseCliCommit(
            commit_id="x",
            repo_id="r",
            branch="b",
            snapshot_id="s",
            message="m",
            author="a",
            committed_at=datetime.now(timezone.utc),
        )
        converted = _commit_to_match(commit, score=1 / 3)
        assert converted.score == round(1 / 3, 4)
| 264 | |
| 265 | |
class TestUnitConstants:
    """Layer 1 unit checks for the search module's constants."""

    def test_default_limit_is_20(self) -> None:
        """``_DEFAULT_LIMIT`` is pinned at 20."""
        assert _DEFAULT_LIMIT == 20

    def test_stop_words_contains_common_words(self) -> None:
        """A sample of common English words are all stop-words."""
        common = ("the", "a", "is", "and", "or", "in", "to")
        missing = [word for word in common if word not in _STOP_WORDS]
        assert not missing

    def test_stop_words_does_not_contain_jazz(self) -> None:
        """A domain word such as "jazz" is not a stop-word."""
        assert "jazz" not in _STOP_WORDS

    def test_stop_words_is_frozenset(self) -> None:
        """``_STOP_WORDS`` is a ``frozenset``."""
        assert isinstance(_STOP_WORDS, frozenset)
| 279 | |
| 280 | |
| 281 | # ── Layer 2 — Integration ───────────────────────────────────────────────────── |
| 282 | |
| 283 | |
class TestIntegrationKeyword:
    """Layer 2 integration checks for ``search_by_keyword``."""

    async def test_matching_commit_returned(self, db_session: AsyncSession) -> None:
        """A commit whose message contains the keyword is the single match."""
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="add harmony voice to the mix")
        await db_session.commit()

        outcome = await search_by_keyword(db_session, repo_id=repo_id, keyword="harmony")

        assert len(outcome.matches) == 1
        first_hit = outcome.matches[0]
        assert "harmony" in first_hit.message

    async def test_no_match_returns_empty(self, db_session: AsyncSession) -> None:
        """A keyword absent from every message yields no matches."""
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="bassline groove")
        await db_session.commit()

        outcome = await search_by_keyword(db_session, repo_id=repo_id, keyword="trumpet")

        assert outcome.matches == []
        assert outcome.mode == "keyword"

    async def test_threshold_filters_low_scores(self, db_session: AsyncSession) -> None:
        """A commit scoring below the requested threshold is dropped."""
        repo_id = await _repo(db_session)
        # The query "jazz rhythm" tokenizes to {jazz, rhythm}; the commit
        # message only contains "jazz", so its overlap is 0.5.
        await _commit(db_session, repo_id, message="jazz improvisation")
        await db_session.commit()

        # 0.5 is under the 0.8 threshold, so nothing should come back.
        outcome = await search_by_keyword(
            db_session,
            repo_id=repo_id,
            keyword="jazz rhythm",
            threshold=0.8,
        )
        assert outcome.matches == []

    async def test_mode_field_is_keyword(self, db_session: AsyncSession) -> None:
        """The response mode is "keyword" even for a repo with no commits."""
        repo_id = await _repo(db_session)
        await db_session.commit()

        outcome = await search_by_keyword(db_session, repo_id=repo_id, keyword="anything")
        assert outcome.mode == "keyword"
| 321 | |
| 322 | |
class TestIntegrationAsk:
    """Layer 2 integration checks for ``search_by_ask``."""

    async def test_strips_stop_words_before_scoring(self, db_session: AsyncSession) -> None:
        """A question of stop-word plus keyword still finds the keyword commit."""
        repo_id = await _repo(db_session)
        # In "the jazz" the stop-word "the" is removed, leaving "jazz" to score.
        await _commit(db_session, repo_id, message="jazz fusion experiment")
        await _commit(db_session, repo_id, message="rock anthem beats")
        await db_session.commit()

        outcome = await search_by_ask(db_session, repo_id=repo_id, question="the jazz")

        assert any("jazz" in hit.message for hit in outcome.matches)
        assert outcome.mode == "ask"

    async def test_all_stop_words_includes_all_commits(self, db_session: AsyncSession) -> None:
        """A question made only of stop-words returns every commit."""
        repo_id = await _repo(db_session)
        for text in ("first commit message", "second commit message"):
            await _commit(db_session, repo_id, message=text)
        await db_session.commit()

        # Nothing but stop-words → no keywords remain → every commit scores 1.0.
        outcome = await search_by_ask(
            db_session,
            repo_id=repo_id,
            question="the a is and or",
        )
        assert len(outcome.matches) == 2
| 347 | |
| 348 | |
class TestIntegrationPattern:
    """Layer 2 integration checks for ``search_by_pattern``."""

    async def test_message_match_preferred_over_branch_match(
        self, db_session: AsyncSession
    ) -> None:
        """Message hits are listed ahead of branch-name hits."""
        repo_id = await _repo(db_session)
        # One commit carries "jazz" in its message, the other only in its branch.
        await _commit(db_session, repo_id, message="jazz fusion", branch="main")
        await _commit(db_session, repo_id, message="unrelated", branch="jazz-experiment")
        await db_session.commit()

        outcome = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")

        assert len(outcome.matches) == 2
        # The message match has to be ranked first.
        sources = [hit.match_source for hit in outcome.matches]
        assert sources == ["message", "branch"]

    async def test_case_insensitive(self, db_session: AsyncSession) -> None:
        """A lowercase pattern matches an uppercase message."""
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="JAZZ FUSION")
        await db_session.commit()

        outcome = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")
        assert len(outcome.matches) == 1

    async def test_mode_field_is_pattern(self, db_session: AsyncSession) -> None:
        """The response mode is "pattern" even for a repo with no commits."""
        repo_id = await _repo(db_session)
        await db_session.commit()

        outcome = await search_by_pattern(db_session, repo_id=repo_id, pattern="x")
        assert outcome.mode == "pattern"
| 378 | |
| 379 | |
class TestIntegrationProperty:
    """Layer 2 integration check for ``search_by_property``."""

    async def test_returns_empty_matches_stub(self, db_session: AsyncSession) -> None:
        """Property search reports mode "property" and no matches."""
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="some commit")
        await db_session.commit()

        outcome = await search_by_property(db_session, repo_id=repo_id, harmony="Fmin")

        # Property mode is currently a stub, so matches are expected to be empty.
        assert outcome.matches == []
        assert outcome.mode == "property"
| 392 | |
| 393 | |
class TestIntegrationFetchCandidates:
    """Layer 2 checks of the ``since`` / ``until`` filters via keyword search."""

    async def test_since_filter(self, db_session: AsyncSession) -> None:
        """Only the commit dated after the ``since`` cutoff is matched."""
        repo_id = await _repo(db_session)
        stamped_messages = (
            ("old commit", datetime(2020, 1, 1, tzinfo=timezone.utc)),
            ("new commit", datetime(2025, 1, 1, tzinfo=timezone.utc)),
        )
        for text, when in stamped_messages:
            await _commit(db_session, repo_id, message=text, committed_at=when)
        await db_session.commit()

        outcome = await search_by_keyword(
            db_session,
            repo_id=repo_id,
            keyword="commit",
            since=datetime(2024, 1, 1, tzinfo=timezone.utc),
        )

        assert len(outcome.matches) == 1
        assert "new" in outcome.matches[0].message

    async def test_until_filter(self, db_session: AsyncSession) -> None:
        """Only the commit dated before the ``until`` cutoff is matched."""
        repo_id = await _repo(db_session)
        stamped_messages = (
            ("old commit", datetime(2020, 1, 1, tzinfo=timezone.utc)),
            ("new commit", datetime(2025, 1, 1, tzinfo=timezone.utc)),
        )
        for text, when in stamped_messages:
            await _commit(db_session, repo_id, message=text, committed_at=when)
        await db_session.commit()

        outcome = await search_by_keyword(
            db_session,
            repo_id=repo_id,
            keyword="commit",
            until=datetime(2022, 1, 1, tzinfo=timezone.utc),
        )

        assert len(outcome.matches) == 1
        assert "old" in outcome.matches[0].message
| 424 | |
| 425 | |
| 426 | # ── Layer 3 — E2E ──────────────────────────────────────────────────────────── |
| 427 | |
| 428 | |
class TestE2EApiSearch:
    """Layer 3 end-to-end checks of the search HTTP routes."""

    async def test_api_search_returns_global_search_result(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        """The global search route answers 200 with a ``groups`` key."""
        repo_id = await _repo(db_session)
        await _hub_commit(db_session, repo_id, message="melody jazz bassline")
        await db_session.commit()

        response = await client.get("/api/search?q=jazz", headers=auth_headers)

        assert response.status_code == 200
        payload = response.json()
        assert "groups" in payload

    async def test_api_search_missing_q_returns_422(
        self, client: AsyncClient, auth_headers: StrDict
    ) -> None:
        """Omitting ``q`` on the global search route is rejected with 422."""
        response = await client.get("/api/search", headers=auth_headers)
        assert response.status_code == 422

    async def test_musehub_search_keyword_mode(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        """Keyword search on a private repo with auth returns matches."""
        repo_id = await _repo(db_session, visibility="private")
        await _commit(db_session, repo_id, message="jazz harmony voice")
        await db_session.commit()

        endpoint = f"/api/repos/{repo_id}/search?q=jazz&mode=keyword"
        response = await client.get(endpoint, headers=auth_headers)

        assert response.status_code == 200
        payload = response.json()
        assert payload["mode"] == "keyword"
        assert len(payload["matches"]) >= 1

    async def test_musehub_search_invalid_mode_returns_422(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        """An unrecognised ``mode`` value is rejected with 422."""
        repo_id = await _repo(db_session)
        await db_session.commit()

        endpoint = f"/api/repos/{repo_id}/search?q=foo&mode=badmode"
        response = await client.get(endpoint, headers=auth_headers)

        assert response.status_code == 422

    async def test_musehub_search_unknown_repo_returns_404(
        self, client: AsyncClient, auth_headers: StrDict
    ) -> None:
        """Searching a random, never-created repo id answers 404."""
        fake_id = secrets.token_hex(16)
        endpoint = f"/api/repos/{fake_id}/search?q=jazz&mode=keyword"

        response = await client.get(endpoint, headers=auth_headers)

        assert response.status_code == 404

    async def test_musehub_search_private_repo_no_auth_returns_401(
        self, client: AsyncClient, db_session: AsyncSession
    ) -> None:
        """Searching a private repo without auth headers answers 401."""
        repo_id = await _repo(db_session, visibility="private")
        await db_session.commit()

        endpoint = f"/api/repos/{repo_id}/search?q=jazz&mode=keyword"
        response = await client.get(endpoint)

        assert response.status_code == 401
| 496 | |
| 497 | |
| 498 | # ── Layer 4 — Stress ───────────────────────────────────────────────────────── |
| 499 | |
| 500 | |
class TestStressSearch:
    """Layer 4 stress checks for ``search_by_keyword``."""

    async def test_200_commits_keyword_respects_limit(
        self, db_session: AsyncSession
    ) -> None:
        """With 200 matching commits, ``limit=10`` caps the returned matches."""
        repo_id = await _repo(db_session)
        for index in range(200):
            await _commit(db_session, repo_id, message=f"jazz groove {index}")
        await db_session.commit()

        outcome = await search_by_keyword(
            db_session,
            repo_id=repo_id,
            keyword="jazz",
            limit=10,
        )

        assert len(outcome.matches) <= 10
        assert outcome.total_scanned >= 200

    async def test_5_concurrent_keyword_searches(
        self, db_session: AsyncSession
    ) -> None:
        """Five repeated searches on the same session each return matches.

        Despite the test name, the five calls are awaited one after another
        on the shared ``db_session``; they do not overlap.
        """
        repo_id = await _repo(db_session)
        for index in range(20):
            await _commit(db_session, repo_id, message=f"harmony beat {index}")
        await db_session.commit()

        outcomes = []
        for _ in range(5):
            outcome = await search_by_keyword(
                db_session, repo_id=repo_id, keyword="harmony"
            )
            outcomes.append(outcome)

        assert all(len(outcome.matches) > 0 for outcome in outcomes)
| 529 | |
| 530 | |
| 531 | # ── Layer 5 — Data Integrity ────────────────────────────────────────────────── |
| 532 | |
| 533 | |
class TestDataIntegritySearch:
    """Layer 5 checks on the ``match_source`` and ``score`` fields of results."""

    async def test_keyword_match_source_is_message(
        self, db_session: AsyncSession
    ) -> None:
        """Every keyword match reports "message" as its source."""
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="jazz fusion experiment")
        await db_session.commit()

        outcome = await search_by_keyword(db_session, repo_id=repo_id, keyword="jazz")

        sources = {hit.match_source for hit in outcome.matches}
        assert sources <= {"message"}

    async def test_keyword_score_in_zero_to_one(
        self, db_session: AsyncSession
    ) -> None:
        """Keyword match scores stay within the closed range [0, 1]."""
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="jazz harmony groove")
        await db_session.commit()

        outcome = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="jazz rhythm harmony"
        )

        assert all(0.0 <= hit.score <= 1.0 for hit in outcome.matches)

    async def test_pattern_message_match_source(self, db_session: AsyncSession) -> None:
        """A pattern found in the message is tagged "message"."""
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="jazz fusion")
        await db_session.commit()

        outcome = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")

        first_hit = outcome.matches[0]
        assert first_hit.match_source == "message"

    async def test_pattern_branch_match_source(self, db_session: AsyncSession) -> None:
        """A pattern found only in the branch name is tagged "branch"."""
        repo_id = await _repo(db_session)
        await _commit(
            db_session,
            repo_id,
            message="unrelated commit",
            branch="jazz-experiment",
        )
        await db_session.commit()

        outcome = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")

        first_hit = outcome.matches[0]
        assert first_hit.match_source == "branch"

    async def test_ask_higher_overlap_scores_higher(
        self, db_session: AsyncSession
    ) -> None:
        """The first ask-mode match scores at least as high as the second."""
        repo_id = await _repo(db_session)
        # This commit contains both question tokens, "jazz" and "harmony".
        await _commit(db_session, repo_id, message="jazz harmony fusion")
        # This commit contains only "jazz".
        await _commit(db_session, repo_id, message="jazz rock experiment")
        await db_session.commit()

        outcome = await search_by_ask(
            db_session, repo_id=repo_id, question="jazz harmony"
        )

        assert len(outcome.matches) >= 2
        # Results are expected to be ordered best score first.
        top, runner_up = outcome.matches[0], outcome.matches[1]
        assert top.score >= runner_up.score
| 590 | |
| 591 | |
| 592 | # ── Layer 6 — Security ──────────────────────────────────────────────────────── |
| 593 | |
| 594 | |
class TestSecuritySearch:
    """Layer 6 security checks: hostile or oversized search input.

    The HTTP tests hand their query values to the client through
    ``params=`` so the payload is percent-encoded and reaches the server
    exactly as written, whatever reserved characters it contains.
    """

    async def test_sql_injection_in_pattern_handled_safely(
        self, db_session: AsyncSession
    ) -> None:
        """A SQL-injection-shaped pattern matches nothing and does not raise."""
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="innocent commit")
        await db_session.commit()

        # SQL injection attempt — should return 0 matches, not crash or return all rows.
        result = await search_by_pattern(
            db_session,
            repo_id=repo_id,
            pattern="'; DROP TABLE musecli_commits; --",
        )
        assert result.matches == []

    async def test_xss_in_query_echoed_in_json_not_rendered(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        """A script-tag query is echoed verbatim inside a JSON response."""
        repo_id = await _repo(db_session, visibility="private")
        await db_session.commit()

        xss = "<script>alert(1)</script>"
        # Pass the payload via params= rather than interpolating it into the
        # URL: raw "<", ">" and "/" would otherwise depend on the client's
        # URL normalisation to arrive intact.
        resp = await client.get(
            f"/api/repos/{repo_id}/search",
            params={"q": xss, "mode": "keyword"},
            headers=auth_headers,
        )
        assert resp.status_code == 200
        body = resp.json()
        # The query is echoed back but must be in JSON (string), not HTML.
        assert body["query"] == xss
        assert resp.headers["content-type"].startswith("application/json")

    async def test_very_long_query_does_not_crash(
        self, db_session: AsyncSession
    ) -> None:
        """A 10,000-character keyword on an empty repo returns no matches."""
        repo_id = await _repo(db_session)
        await db_session.commit()

        long_query = "jazz " * 2000  # 10k chars
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword=long_query
        )
        assert result.matches == []

    async def test_global_search_route_max_length_enforced(
        self, client: AsyncClient, auth_headers: StrDict
    ) -> None:
        """A 501-character ``q`` on ``/api/search/repos`` is rejected with 422."""
        # GET /api/search/repos has max_length=500 on q — over that → 422.
        long_q = "x" * 501
        resp = await client.get(
            "/api/search/repos",
            params={"q": long_q},
            headers=auth_headers,
        )
        assert resp.status_code == 422

    async def test_null_byte_in_pattern_handled(
        self, db_session: AsyncSession
    ) -> None:
        """A pattern containing a NUL byte still yields a list of matches."""
        repo_id = await _repo(db_session)
        await db_session.commit()

        # Null bytes must not cause a crash.
        result = await search_by_pattern(
            db_session, repo_id=repo_id, pattern="foo\x00bar"
        )
        assert isinstance(result.matches, list)
| 659 | |
| 660 | |
| 661 | # ── Layer 7 — Performance ───────────────────────────────────────────────────── |
| 662 | |
| 663 | |
class TestPerformanceSearch:
    """Layer 7 wall-clock budgets for tokenizing and keyword search."""

    def test_1000_tokenize_calls_under_100ms(self) -> None:
        """Tokenizing 1000 short messages finishes within 0.1 seconds."""
        samples = [f"add harmony voice to track {i}" for i in range(1000)]

        began = time.perf_counter()
        for sample in samples:
            _tokenize(sample)
        elapsed = time.perf_counter() - began

        assert elapsed < 0.1, f"1000 _tokenize calls took {elapsed:.3f}s (expected <0.1s)"

    async def test_keyword_search_500_commits_under_500ms(
        self, db_session: AsyncSession
    ) -> None:
        """One keyword search over 500 commits finishes within 0.5 seconds."""
        repo_id = await _repo(db_session)
        for index in range(500):
            await _commit(db_session, repo_id, message=f"jazz groove rhythm {index}")
        await db_session.commit()

        # Only the search call is timed; seeding the commits is excluded.
        began = time.perf_counter()
        outcome = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="jazz"
        )
        elapsed = time.perf_counter() - began

        assert elapsed < 0.5, f"search over 500 commits took {elapsed:.3f}s (expected <0.5s)"
        assert len(outcome.matches) > 0
File History
1 commit
sha256:3ff9c9863a9891bdcde71b4a43228f66d0493e38b7cc1d09fe9eb7de774046b2
feat: add repair-commit wire endpoint (API parity with repa…
Opus 4.8
minor
⚠
1 day ago