"""Section 17 — Search: 7-layer test suite. Covers gaps in the existing 26 search tests: Layer 1 Unit: - _tokenize returns lowercase tokens, ignores punctuation - _tokenize empty string → empty set - _overlap_score full match → 1.0 - _overlap_score partial match - _overlap_score no match → 0.0 - _overlap_score empty query → 0.0 - _commit_to_match round-trips all fields, rounds score to 4dp - _DEFAULT_LIMIT == 20 - _STOP_WORDS excludes common words Layer 2 Integration: - search_by_keyword with matching commit - search_by_keyword no match → empty matches - search_by_keyword threshold filters low-scoring commits - search_by_ask strips stop-words before scoring - search_by_ask no useful tokens → all commits included - search_by_pattern message match preferred over branch match - search_by_pattern case-insensitive - search_by_property returns empty matches (stub) - _fetch_candidates respects since/until filters - _fetch_candidates caps at 5000 Layer 3 E2E: - GET /api/v1/search?q=foo returns GlobalSearchResult JSON - GET /api/v1/search missing q → 422 - GET /{repo_id}/search/commits?q=foo&mode=keyword → SearchResponse - GET /{repo_id}/search/commits invalid mode → 422 - GET /{repo_id}/search/commits unknown repo → 404 - GET /{repo_id}/search/commits private repo no auth → 401 Layer 4 Stress: - 200 commits, keyword search returns at most limit=10 - 5 concurrent search_by_keyword calls, all succeed Layer 5 Data Integrity: - keyword match_source == "message", score in [0,1] - pattern message match_source == "message" - pattern branch match_source == "branch" - ask mode scores commits with matching tokens higher Layer 6 Security: - SQL injection pattern in q param handled safely - XSS probe in q echoed as plain text in JSON (not rendered) - Very long query (10k chars) doesn't crash - null bytes in query handled gracefully Layer 7 Performance: - 1000x _tokenize calls in <100ms - search_by_keyword over 500 commits completes in <500ms """ from __future__ import annotations 
import asyncio
import secrets
import time
from datetime import datetime, timezone, timedelta

import pytest
from httpx import AsyncClient
from sqlalchemy.ext.asyncio import AsyncSession

from musehub.core.genesis import compute_identity_id, compute_repo_id
from musehub.db.musehub_repo_models import MusehubRepo, MusehubCommit, MusehubCommitRef
from musehub.muse_cli.models import MuseCliCommit, MuseCliSnapshot
from musehub.types.json_types import StrDict
from musehub.services.musehub_search import (
    _tokenize,
    _overlap_score,
    _commit_to_match,
    _DEFAULT_LIMIT,
    _STOP_WORDS,
    search_by_ask,
    search_by_keyword,
    search_by_pattern,
    search_by_property,
)


# ── Shared helpers ────────────────────────────────────────────────────────────


def _uid() -> str:
    """Return a random 32-character hex string used as a unique test ID."""
    return secrets.token_hex(16)


async def _repo(
    session: AsyncSession,
    *,
    visibility: str = "public",
    name: str | None = None,
) -> str:
    """Add a ``MusehubRepo`` owned by ``testuser`` and return its repo ID.

    Args:
        session: Session the repo row is added to and flushed on.
        visibility: Value stored in the repo's ``visibility`` column.
        name: Repo name; a random ``repo-<hex>`` name is generated when
            omitted (or empty).

    Returns:
        The new repo's ``repo_id`` as a string.
    """
    name = name or f"repo-{_uid()[:8]}"
    slug = name[:64]
    created_at = datetime.now(tz=timezone.utc)
    owner_id = compute_identity_id(b"testuser")
    r = MusehubRepo(
        repo_id=compute_repo_id(owner_id, slug, "code", created_at.isoformat()),
        name=name,
        owner="testuser",
        slug=slug,
        visibility=visibility,
        owner_user_id=owner_id,
        created_at=created_at,
        updated_at=created_at,
    )
    session.add(r)
    await session.flush()
    return str(r.repo_id)


async def _snap(session: AsyncSession) -> str:
    """Add a ``MuseCliSnapshot`` with an empty manifest and return its ID."""
    snap_id = f"snap-{_uid()[:16]}"
    session.add(MuseCliSnapshot(snapshot_id=snap_id, manifest={}))
    await session.flush()
    return snap_id


async def _commit(
    session: AsyncSession,
    repo_id: str,
    *,
    message: str = "test commit",
    branch: str = "main",
    author: str = "alice",
    committed_at: datetime | None = None,
) -> MuseCliCommit:
    """Add a ``MuseCliCommit`` (with a fresh snapshot) and return it.

    Args:
        session: Session the commit row is added to and flushed on.
        repo_id: Repo the commit belongs to.
        message: Commit message.
        branch: Branch name stored on the commit.
        author: Author stored on the commit.
        committed_at: Commit timestamp; defaults to the current UTC time.

    Returns:
        The flushed ``MuseCliCommit`` instance.
    """
    snap_id = await _snap(session)
    c = MuseCliCommit(
        commit_id=_uid(),
        repo_id=repo_id,
        branch=branch,
        snapshot_id=snap_id,
        message=message,
        author=author,
        committed_at=committed_at or datetime.now(timezone.utc),
    )
    session.add(c)
    await session.flush()
    return c


async def _hub_commit(
    session: AsyncSession,
    repo_id: str,
    *,
    message: str = "hub commit",
    branch: str = "main",
) -> None:
    """Add a ``MusehubCommit`` plus the ``MusehubCommitRef`` linking it to *repo_id*."""
    c = MusehubCommit(
        commit_id=_uid(),
        branch=branch,
        parent_ids=[],
        message=message,
        author="testuser",
        timestamp=datetime.now(tz=timezone.utc),
    )
    session.add(c)
    session.add(MusehubCommitRef(repo_id=repo_id, commit_id=c.commit_id))
    await session.flush()


# ── Layer 1 — Unit ────────────────────────────────────────────────────────────


class TestUnitTokenize:
    """Unit tests for ``_tokenize``."""

    def test_returns_lowercase_tokens(self) -> None:
        result = _tokenize("Hello World")
        assert result == {"hello", "world"}

    def test_ignores_punctuation(self) -> None:
        result = _tokenize("hello, world! foo-bar")
        assert "hello" in result
        assert "world" in result
        assert "foo" in result
        assert "bar" in result

    def test_empty_string_returns_empty_set(self) -> None:
        assert _tokenize("") == set()

    def test_alphanumeric_tokens(self) -> None:
        result = _tokenize("feat123 fix456")
        assert "feat123" in result
        assert "fix456" in result

    def test_deduplicates_tokens(self) -> None:
        result = _tokenize("foo foo foo")
        assert result == {"foo"}


class TestUnitOverlapScore:
    """Unit tests for ``_overlap_score``."""

    def test_full_match_returns_one(self) -> None:
        score = _overlap_score({"hello", "world"}, "hello world extra")
        assert score == 1.0

    def test_partial_match(self) -> None:
        score = _overlap_score({"hello", "world"}, "hello extra")
        assert score == 0.5

    def test_no_match_returns_zero(self) -> None:
        score = _overlap_score({"hello"}, "goodbye universe")
        assert score == 0.0

    def test_empty_query_returns_zero(self) -> None:
        score = _overlap_score(set(), "anything goes")
        assert score == 0.0

    def test_single_token_match(self) -> None:
        score = _overlap_score({"jazz"}, "jazz fusion bassline")
        assert score == 1.0


class TestUnitCommitToMatch:
    """Unit tests for ``_commit_to_match``."""

    def test_round_trips_all_fields(self) -> None:
        from musehub.models.musehub import SearchCommitMatch

        c = MuseCliCommit(
            commit_id="abc123",
            repo_id="repo-1",
            branch="main",
            snapshot_id="snap-1",
            message="add harmony voice",
            author="alice",
            committed_at=datetime(2025, 1, 1, tzinfo=timezone.utc),
        )
        match = _commit_to_match(c, score=0.12345678, match_source="message")
        assert isinstance(match, SearchCommitMatch)
        assert match.commit_id == "abc123"
        assert match.branch == "main"
        assert match.message == "add harmony voice"
        assert match.author == "alice"
        assert match.score == round(0.12345678, 4)
        assert match.match_source == "message"

    def test_default_score_is_one(self) -> None:
        c = MuseCliCommit(
            commit_id="x",
            repo_id="r",
            branch="b",
            snapshot_id="s",
            message="m",
            author="a",
            committed_at=datetime.now(timezone.utc),
        )
        match = _commit_to_match(c)
        assert match.score == 1.0

    def test_score_rounded_to_4dp(self) -> None:
        c = MuseCliCommit(
            commit_id="x",
            repo_id="r",
            branch="b",
            snapshot_id="s",
            message="m",
            author="a",
            committed_at=datetime.now(timezone.utc),
        )
        match = _commit_to_match(c, score=1 / 3)
        assert match.score == round(1 / 3, 4)


class TestUnitConstants:
    """Unit tests for the module-level search constants."""

    def test_default_limit_is_20(self) -> None:
        assert _DEFAULT_LIMIT == 20

    def test_stop_words_contains_common_words(self) -> None:
        for word in ("the", "a", "is", "and", "or", "in", "to"):
            assert word in _STOP_WORDS

    def test_stop_words_does_not_contain_jazz(self) -> None:
        assert "jazz" not in _STOP_WORDS

    def test_stop_words_is_frozenset(self) -> None:
        assert isinstance(_STOP_WORDS, frozenset)


# ── Layer 2 — Integration ─────────────────────────────────────────────────────


class TestIntegrationKeyword:
    """Integration tests for ``search_by_keyword`` against a real session."""

    async def test_matching_commit_returned(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="add harmony voice to the mix")
        await db_session.commit()

        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="harmony")
        assert len(result.matches) == 1
        assert "harmony" in result.matches[0].message

    async def test_no_match_returns_empty(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="bassline groove")
        await db_session.commit()

        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="trumpet")
        assert result.matches == []
        assert result.mode == "keyword"

    async def test_threshold_filters_low_scores(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        # The keyword "jazz rhythm" has two tokens; the message below contains
        # only "jazz", so the expected overlap score is 0.5.
        await _commit(db_session, repo_id, message="jazz improvisation")
        await db_session.commit()

        # With threshold=0.8, score=0.5 commit should be excluded.
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="jazz rhythm", threshold=0.8
        )
        assert result.matches == []

    async def test_mode_field_is_keyword(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()

        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="anything")
        assert result.mode == "keyword"


class TestIntegrationAsk:
    """Integration tests for ``search_by_ask``."""

    async def test_strips_stop_words_before_scoring(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        # "the jazz" → stop-word "the" removed → keyword "jazz" scored
        await _commit(db_session, repo_id, message="jazz fusion experiment")
        await _commit(db_session, repo_id, message="rock anthem beats")
        await db_session.commit()

        result = await search_by_ask(db_session, repo_id=repo_id, question="the jazz")
        matched_messages = [m.message for m in result.matches]
        assert any("jazz" in msg for msg in matched_messages)
        assert result.mode == "ask"

    async def test_all_stop_words_includes_all_commits(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="first commit message")
        await _commit(db_session, repo_id, message="second commit message")
        await db_session.commit()

        # Query made entirely of stop-words → no keywords → score 1.0 for all
        result = await search_by_ask(
            db_session, repo_id=repo_id, question="the a is and or"
        )
        assert len(result.matches) == 2


class TestIntegrationPattern:
    """Integration tests for ``search_by_pattern``."""

    async def test_message_match_preferred_over_branch_match(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        # Two commits: one with "jazz" in message, one with "jazz" in branch
        await _commit(db_session, repo_id, message="jazz fusion", branch="main")
        await _commit(db_session, repo_id, message="unrelated", branch="jazz-experiment")
        await db_session.commit()

        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")
        assert len(result.matches) == 2
        # Message match must come first.
        assert result.matches[0].match_source == "message"
        assert result.matches[1].match_source == "branch"

    async def test_case_insensitive(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="JAZZ FUSION")
        await db_session.commit()

        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")
        assert len(result.matches) == 1

    async def test_mode_field_is_pattern(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()

        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="x")
        assert result.mode == "pattern"


class TestIntegrationProperty:
    """Integration tests for ``search_by_property``."""

    async def test_returns_empty_matches_stub(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="some commit")
        await db_session.commit()

        result = await search_by_property(
            db_session, repo_id=repo_id, harmony="Fmin"
        )
        # property mode is a stub — always returns empty matches
        assert result.matches == []
        assert result.mode == "property"


class TestIntegrationFetchCandidates:
    """Date-window filtering, exercised through ``search_by_keyword``."""

    async def test_since_filter(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        old = datetime(2020, 1, 1, tzinfo=timezone.utc)
        new = datetime(2025, 1, 1, tzinfo=timezone.utc)
        await _commit(db_session, repo_id, message="old commit", committed_at=old)
        await _commit(db_session, repo_id, message="new commit", committed_at=new)
        await db_session.commit()

        cutoff = datetime(2024, 1, 1, tzinfo=timezone.utc)
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="commit", since=cutoff
        )
        assert len(result.matches) == 1
        assert "new" in result.matches[0].message

    async def test_until_filter(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        old = datetime(2020, 1, 1, tzinfo=timezone.utc)
        new = datetime(2025, 1, 1, tzinfo=timezone.utc)
        await _commit(db_session, repo_id, message="old commit", committed_at=old)
        await _commit(db_session, repo_id, message="new commit", committed_at=new)
        await db_session.commit()

        cutoff = datetime(2022, 1, 1, tzinfo=timezone.utc)
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="commit", until=cutoff
        )
        assert len(result.matches) == 1
        assert "old" in result.matches[0].message


# ── Layer 3 — E2E ────────────────────────────────────────────────────────────


class TestE2EApiSearch:
    """End-to-end tests that hit the HTTP search routes through the test client."""

    async def test_api_search_returns_global_search_result(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        repo_id = await _repo(db_session)
        await _hub_commit(db_session, repo_id, message="melody jazz bassline")
        await db_session.commit()

        resp = await client.get("/api/search?q=jazz", headers=auth_headers)
        assert resp.status_code == 200
        body = resp.json()
        assert "groups" in body

    async def test_api_search_missing_q_returns_422(
        self, client: AsyncClient, auth_headers: StrDict
    ) -> None:
        resp = await client.get("/api/search", headers=auth_headers)
        assert resp.status_code == 422

    async def test_musehub_search_keyword_mode(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        repo_id = await _repo(db_session, visibility="private")
        await _commit(db_session, repo_id, message="jazz harmony voice")
        await db_session.commit()

        resp = await client.get(
            f"/api/repos/{repo_id}/search?q=jazz&mode=keyword",
            headers=auth_headers,
        )
        assert resp.status_code == 200
        body = resp.json()
        assert body["mode"] == "keyword"
        assert len(body["matches"]) >= 1

    async def test_musehub_search_invalid_mode_returns_422(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()

        resp = await client.get(
            f"/api/repos/{repo_id}/search?q=foo&mode=badmode",
            headers=auth_headers,
        )
        assert resp.status_code == 422

    async def test_musehub_search_unknown_repo_returns_404(
        self, client: AsyncClient, auth_headers: StrDict
    ) -> None:
        fake_id = secrets.token_hex(16)
        resp = await client.get(
            f"/api/repos/{fake_id}/search?q=jazz&mode=keyword",
            headers=auth_headers,
        )
        assert resp.status_code == 404

    async def test_musehub_search_private_repo_no_auth_returns_401(
        self, client: AsyncClient, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session, visibility="private")
        await db_session.commit()

        resp = await client.get(
            f"/api/repos/{repo_id}/search?q=jazz&mode=keyword",
        )
        assert resp.status_code == 401


# ── Layer 4 — Stress ─────────────────────────────────────────────────────────


class TestStressSearch:
    """Volume and repeated-call tests for ``search_by_keyword``."""

    async def test_200_commits_keyword_respects_limit(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        for i in range(200):
            await _commit(db_session, repo_id, message=f"jazz groove {i}")
        await db_session.commit()

        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="jazz", limit=10
        )
        assert len(result.matches) <= 10
        assert result.total_scanned >= 200

    async def test_5_concurrent_keyword_searches(
        self, db_session: AsyncSession
    ) -> None:
        """Run five keyword searches on the same session; all must return matches.

        NOTE(review): despite the name, the searches are awaited one after
        another.  They all share ``db_session``, and a single SQLAlchemy
        ``AsyncSession`` must not be used from concurrent tasks, so they are
        deliberately not fanned out with ``asyncio.gather``.
        """
        repo_id = await _repo(db_session)
        for i in range(20):
            await _commit(db_session, repo_id, message=f"harmony beat {i}")
        await db_session.commit()

        results = [
            await search_by_keyword(db_session, repo_id=repo_id, keyword="harmony")
            for _ in range(5)
        ]
        assert all(len(r.matches) > 0 for r in results)


# ── Layer 5 — Data Integrity ──────────────────────────────────────────────────


class TestDataIntegritySearch:
    """Checks on the shape and ordering of returned matches."""

    async def test_keyword_match_source_is_message(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="jazz fusion experiment")
        await db_session.commit()

        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="jazz")
        # Guard against a vacuous pass: all() over an empty list is True.
        assert result.matches
        assert all(m.match_source == "message" for m in result.matches)

    async def test_keyword_score_in_zero_to_one(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="jazz harmony groove")
        await db_session.commit()

        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="jazz rhythm harmony")
        for m in result.matches:
            assert 0.0 <= m.score <= 1.0

    async def test_pattern_message_match_source(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="jazz fusion")
        await db_session.commit()

        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")
        assert result.matches[0].match_source == "message"

    async def test_pattern_branch_match_source(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(
            db_session, repo_id, message="unrelated commit", branch="jazz-experiment"
        )
        await db_session.commit()

        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")
        assert result.matches[0].match_source == "branch"

    async def test_ask_higher_overlap_scores_higher(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        # high-match commit has both "jazz" and "harmony"
        await _commit(db_session, repo_id, message="jazz harmony fusion")
        # low-match commit has only "jazz"
        await _commit(db_session, repo_id, message="jazz rock experiment")
        await db_session.commit()

        result = await search_by_ask(
            db_session, repo_id=repo_id, question="jazz harmony"
        )
        assert len(result.matches) >= 2
        # First result should have the higher-scoring commit
        assert result.matches[0].score >= result.matches[1].score


# ── Layer 6 — Security ────────────────────────────────────────────────────────


class TestSecuritySearch:
    """Hostile-input tests for the search service and routes."""

    async def test_sql_injection_in_pattern_handled_safely(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="innocent commit")
        await db_session.commit()

        # SQL injection attempt — should return 0 matches, not crash or return all rows.
        result = await search_by_pattern(
            db_session,
            repo_id=repo_id,
            pattern="'; DROP TABLE musecli_commits; --",
        )
        assert result.matches == []

    async def test_xss_in_query_echoed_in_json_not_rendered(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        repo_id = await _repo(db_session, visibility="private")
        await db_session.commit()

        # A real markup payload: an empty string would not probe anything.
        xss = "<script>alert(1)</script>"
        # Pass the payload via ``params`` so the client URL-encodes the
        # angle brackets instead of relying on a hand-built query string.
        resp = await client.get(
            f"/api/repos/{repo_id}/search",
            params={"q": xss, "mode": "keyword"},
            headers=auth_headers,
        )
        assert resp.status_code == 200
        body = resp.json()
        # The query is echoed back but must be in JSON (string), not HTML.
        assert body["query"] == xss
        assert resp.headers["content-type"].startswith("application/json")

    async def test_very_long_query_does_not_crash(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()

        long_query = "jazz " * 2000  # 10k chars
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword=long_query
        )
        assert result.matches == []

    async def test_global_search_route_max_length_enforced(
        self, client: AsyncClient, auth_headers: StrDict
    ) -> None:
        # GET /api/search/repos has max_length=500 on q — over that → 422.
        long_q = "x" * 501
        resp = await client.get(f"/api/search/repos?q={long_q}", headers=auth_headers)
        assert resp.status_code == 422

    async def test_null_byte_in_pattern_handled(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()

        # Null bytes must not cause a crash.
        result = await search_by_pattern(
            db_session, repo_id=repo_id, pattern="foo\x00bar"
        )
        assert isinstance(result.matches, list)


# ── Layer 7 — Performance ─────────────────────────────────────────────────────


class TestPerformanceSearch:
    """Wall-clock budget tests; thresholds are absolute and machine-dependent."""

    def test_1000_tokenize_calls_under_100ms(self) -> None:
        texts = [f"add harmony voice to track {i}" for i in range(1000)]
        start = time.perf_counter()
        for t in texts:
            _tokenize(t)
        elapsed = time.perf_counter() - start
        assert elapsed < 0.1, f"1000 _tokenize calls took {elapsed:.3f}s (expected <0.1s)"

    async def test_keyword_search_500_commits_under_500ms(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        for i in range(500):
            await _commit(db_session, repo_id, message=f"jazz groove rhythm {i}")
        await db_session.commit()

        # Only the search itself is timed; fixture setup is excluded.
        start = time.perf_counter()
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="jazz"
        )
        elapsed = time.perf_counter() - start
        assert elapsed < 0.5, f"search over 500 commits took {elapsed:.3f}s (expected <0.5s)"
        assert len(result.matches) > 0