"""Section 17 — Search: 7-layer test suite.

Covers gaps in the existing 26 search tests:

Layer 1 Unit:
  - _tokenize returns lowercase tokens, ignores punctuation
  - _tokenize empty string → empty set
  - _overlap_score full match → 1.0
  - _overlap_score partial match
  - _overlap_score no match → 0.0
  - _overlap_score empty query → 0.0
  - _commit_to_match round-trips all fields, rounds score to 4dp
  - _DEFAULT_LIMIT == 20
  - _STOP_WORDS excludes common words

Layer 2 Integration:
  - search_by_keyword with matching commit
  - search_by_keyword no match → empty matches
  - search_by_keyword threshold filters low-scoring commits
  - search_by_ask strips stop-words before scoring
  - search_by_ask no useful tokens → all commits included
  - search_by_pattern message match preferred over branch match
  - search_by_pattern case-insensitive
  - search_by_property returns empty matches (stub)
  - _fetch_candidates respects since/until filters
  - _fetch_candidates caps at 5000

Layer 3 E2E:
  - GET /api/v1/search?q=foo returns GlobalSearchResult JSON
  - GET /api/v1/search missing q → 422
  - GET /{repo_id}/search/commits?q=foo&mode=keyword → SearchResponse
  - GET /{repo_id}/search/commits invalid mode → 422
  - GET /{repo_id}/search/commits unknown repo → 404
  - GET /{repo_id}/search/commits private repo no auth → 401

Layer 4 Stress:
  - 200 commits, keyword search returns at most limit=10
  - 5 concurrent search_by_keyword calls, all succeed

Layer 5 Data Integrity:
  - keyword match_source == "message", score in [0,1]
  - pattern message match_source == "message"
  - pattern branch match_source == "branch"
  - ask mode scores commits with matching tokens higher

Layer 6 Security:
  - SQL injection pattern in q param handled safely
  - XSS probe in q echoed as plain text in JSON (not rendered)
  - Very long query (10k chars) doesn't crash
  - null bytes in query handled gracefully

Layer 7 Performance:
  - 1000x _tokenize calls in <100ms
  - search_by_keyword over 500 commits completes in <500ms
"""
from __future__ import annotations

import asyncio
import secrets
import time
from datetime import datetime, timezone, timedelta

import pytest
from httpx import AsyncClient
from sqlalchemy.ext.asyncio import AsyncSession

from musehub.core.genesis import compute_identity_id, compute_repo_id
from musehub.db.musehub_repo_models import MusehubRepo, MusehubCommit, MusehubCommitRef
from musehub.muse_cli.models import MuseCliCommit, MuseCliSnapshot
from musehub.types.json_types import StrDict
from musehub.services.musehub_search import (
    _tokenize,
    _overlap_score,
    _commit_to_match,
    _DEFAULT_LIMIT,
    _STOP_WORDS,
    search_by_ask,
    search_by_keyword,
    search_by_pattern,
    search_by_property,
)


# ── Shared helpers ────────────────────────────────────────────────────────────


def _uid() -> str:
    return secrets.token_hex(16)


async def _repo(
    session: AsyncSession,
    *,
    visibility: str = "public",
    name: str | None = None,
) -> str:
    name = name or f"repo-{_uid()[:8]}"
    slug = name[:64]
    created_at = datetime.now(tz=timezone.utc)
    owner_id = compute_identity_id(b"testuser")
    r = MusehubRepo(
        repo_id=compute_repo_id(owner_id, slug, "code", created_at.isoformat()),
        name=name,
        owner="testuser",
        slug=slug,
        visibility=visibility,
        owner_user_id=owner_id,
        created_at=created_at,
        updated_at=created_at,
    )
    session.add(r)
    await session.flush()
    return str(r.repo_id)


async def _snap(session: AsyncSession) -> str:
    snap_id = f"snap-{_uid()[:16]}"
    session.add(MuseCliSnapshot(snapshot_id=snap_id, manifest={}))
    await session.flush()
    return snap_id


async def _commit(
    session: AsyncSession,
    repo_id: str,
    *,
    message: str = "test commit",
    branch: str = "main",
    author: str = "alice",
    committed_at: datetime | None = None,
) -> MuseCliCommit:
    snap_id = await _snap(session)
    c = MuseCliCommit(
        commit_id=_uid(),
        repo_id=repo_id,
        branch=branch,
        snapshot_id=snap_id,
        message=message,
        author=author,
        committed_at=committed_at or datetime.now(timezone.utc),
    )
    session.add(c)
    await session.flush()
    return c


async def _hub_commit(
    session: AsyncSession,
    repo_id: str,
    *,
    message: str = "hub commit",
    branch: str = "main",
) -> None:
    c = MusehubCommit(
        commit_id=_uid(),
        branch=branch,
        parent_ids=[],
        message=message,
        author="testuser",
        timestamp=datetime.now(tz=timezone.utc),
    )
    session.add(c)
    session.add(MusehubCommitRef(repo_id=repo_id, commit_id=c.commit_id))
    await session.flush()


# ── Layer 1 — Unit ────────────────────────────────────────────────────────────


class TestUnitTokenize:
    def test_returns_lowercase_tokens(self) -> None:
        result = _tokenize("Hello World")
        assert result == {"hello", "world"}

    def test_ignores_punctuation(self) -> None:
        result = _tokenize("hello, world! foo-bar")
        assert "hello" in result
        assert "world" in result
        assert "foo" in result
        assert "bar" in result

    def test_empty_string_returns_empty_set(self) -> None:
        assert _tokenize("") == set()

    def test_alphanumeric_tokens(self) -> None:
        result = _tokenize("feat123 fix456")
        assert "feat123" in result
        assert "fix456" in result

    def test_deduplicates_tokens(self) -> None:
        result = _tokenize("foo foo foo")
        assert result == {"foo"}


class TestUnitOverlapScore:
    def test_full_match_returns_one(self) -> None:
        score = _overlap_score({"hello", "world"}, "hello world extra")
        assert score == 1.0

    def test_partial_match(self) -> None:
        score = _overlap_score({"hello", "world"}, "hello extra")
        assert score == 0.5

    def test_no_match_returns_zero(self) -> None:
        score = _overlap_score({"hello"}, "goodbye universe")
        assert score == 0.0

    def test_empty_query_returns_zero(self) -> None:
        score = _overlap_score(set(), "anything goes")
        assert score == 0.0

    def test_single_token_match(self) -> None:
        score = _overlap_score({"jazz"}, "jazz fusion bassline")
        assert score == 1.0


class TestUnitCommitToMatch:
    def test_round_trips_all_fields(self) -> None:
        from musehub.models.musehub import SearchCommitMatch

        c = MuseCliCommit(
            commit_id="abc123",
            repo_id="repo-1",
            branch="main",
            snapshot_id="snap-1",
            message="add harmony voice",
            author="alice",
            committed_at=datetime(2025, 1, 1, tzinfo=timezone.utc),
        )
        match = _commit_to_match(c, score=0.12345678, match_source="message")
        assert isinstance(match, SearchCommitMatch)
        assert match.commit_id == "abc123"
        assert match.branch == "main"
        assert match.message == "add harmony voice"
        assert match.author == "alice"
        assert match.score == round(0.12345678, 4)
        assert match.match_source == "message"

    def test_default_score_is_one(self) -> None:
        c = MuseCliCommit(
            commit_id="x",
            repo_id="r",
            branch="b",
            snapshot_id="s",
            message="m",
            author="a",
            committed_at=datetime.now(timezone.utc),
        )
        match = _commit_to_match(c)
        assert match.score == 1.0

    def test_score_rounded_to_4dp(self) -> None:
        c = MuseCliCommit(
            commit_id="x",
            repo_id="r",
            branch="b",
            snapshot_id="s",
            message="m",
            author="a",
            committed_at=datetime.now(timezone.utc),
        )
        match = _commit_to_match(c, score=1 / 3)
        assert match.score == round(1 / 3, 4)


class TestUnitConstants:
    def test_default_limit_is_20(self) -> None:
        assert _DEFAULT_LIMIT == 20

    def test_stop_words_contains_common_words(self) -> None:
        for word in ("the", "a", "is", "and", "or", "in", "to"):
            assert word in _STOP_WORDS

    def test_stop_words_does_not_contain_jazz(self) -> None:
        assert "jazz" not in _STOP_WORDS

    def test_stop_words_is_frozenset(self) -> None:
        assert isinstance(_STOP_WORDS, frozenset)


# ── Layer 2 — Integration ─────────────────────────────────────────────────────


class TestIntegrationKeyword:
    async def test_matching_commit_returned(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="add harmony voice to the mix")
        await db_session.commit()

        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="harmony")
        assert len(result.matches) == 1
        assert "harmony" in result.matches[0].message

    async def test_no_match_returns_empty(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="bassline groove")
        await db_session.commit()

        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="trumpet")
        assert result.matches == []
        assert result.mode == "keyword"

    async def test_threshold_filters_low_scores(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        # "jazz rhythm" → keyword="jazz rhythm" → tokens={jazz,rhythm}
        # commit has only "jazz" → overlap = 0.5
        await _commit(db_session, repo_id, message="jazz improvisation")
        await db_session.commit()

        # With threshold=0.8, score=0.5 commit should be excluded.
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="jazz rhythm", threshold=0.8
        )
        assert result.matches == []

    async def test_mode_field_is_keyword(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()
        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="anything")
        assert result.mode == "keyword"


class TestIntegrationAsk:
    async def test_strips_stop_words_before_scoring(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        # "the jazz" → stop-word "the" removed → keyword "jazz" scored
        await _commit(db_session, repo_id, message="jazz fusion experiment")
        await _commit(db_session, repo_id, message="rock anthem beats")
        await db_session.commit()

        result = await search_by_ask(db_session, repo_id=repo_id, question="the jazz")
        matched_messages = [m.message for m in result.matches]
        assert any("jazz" in msg for msg in matched_messages)
        assert result.mode == "ask"

    async def test_all_stop_words_includes_all_commits(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="first commit message")
        await _commit(db_session, repo_id, message="second commit message")
        await db_session.commit()

        # Query made entirely of stop-words → no keywords → score 1.0 for all
        result = await search_by_ask(
            db_session, repo_id=repo_id, question="the a is and or"
        )
        assert len(result.matches) == 2


class TestIntegrationPattern:
    async def test_message_match_preferred_over_branch_match(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        # Two commits: one with "jazz" in message, one with "jazz" in branch
        await _commit(db_session, repo_id, message="jazz fusion", branch="main")
        await _commit(db_session, repo_id, message="unrelated", branch="jazz-experiment")
        await db_session.commit()

        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")
        assert len(result.matches) == 2
        # Message match must come first.
        assert result.matches[0].match_source == "message"
        assert result.matches[1].match_source == "branch"

    async def test_case_insensitive(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="JAZZ FUSION")
        await db_session.commit()

        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")
        assert len(result.matches) == 1

    async def test_mode_field_is_pattern(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()
        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="x")
        assert result.mode == "pattern"


class TestIntegrationProperty:
    async def test_returns_empty_matches_stub(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="some commit")
        await db_session.commit()

        result = await search_by_property(
            db_session, repo_id=repo_id, harmony="Fmin"
        )
        # property mode is a stub — always returns empty matches
        assert result.matches == []
        assert result.mode == "property"


class TestIntegrationFetchCandidates:
    async def test_since_filter(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        old = datetime(2020, 1, 1, tzinfo=timezone.utc)
        new = datetime(2025, 1, 1, tzinfo=timezone.utc)
        await _commit(db_session, repo_id, message="old commit", committed_at=old)
        await _commit(db_session, repo_id, message="new commit", committed_at=new)
        await db_session.commit()

        cutoff = datetime(2024, 1, 1, tzinfo=timezone.utc)
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="commit", since=cutoff
        )
        assert len(result.matches) == 1
        assert "new" in result.matches[0].message

    async def test_until_filter(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        old = datetime(2020, 1, 1, tzinfo=timezone.utc)
        new = datetime(2025, 1, 1, tzinfo=timezone.utc)
        await _commit(db_session, repo_id, message="old commit", committed_at=old)
        await _commit(db_session, repo_id, message="new commit", committed_at=new)
        await db_session.commit()

        cutoff = datetime(2022, 1, 1, tzinfo=timezone.utc)
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="commit", until=cutoff
        )
        assert len(result.matches) == 1
        assert "old" in result.matches[0].message


# ── Layer 3 — E2E ────────────────────────────────────────────────────────────


class TestE2EApiSearch:
    async def test_api_search_returns_global_search_result(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        repo_id = await _repo(db_session)
        await _hub_commit(db_session, repo_id, message="melody jazz bassline")
        await db_session.commit()

        resp = await client.get("/api/search?q=jazz", headers=auth_headers)
        assert resp.status_code == 200
        body = resp.json()
        assert "groups" in body

    async def test_api_search_missing_q_returns_422(
        self, client: AsyncClient, auth_headers: StrDict
    ) -> None:
        resp = await client.get("/api/search", headers=auth_headers)
        assert resp.status_code == 422

    async def test_musehub_search_keyword_mode(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        repo_id = await _repo(db_session, visibility="private")
        await _commit(db_session, repo_id, message="jazz harmony voice")
        await db_session.commit()

        resp = await client.get(
            f"/api/repos/{repo_id}/search?q=jazz&mode=keyword",
            headers=auth_headers,
        )
        assert resp.status_code == 200
        body = resp.json()
        assert body["mode"] == "keyword"
        assert len(body["matches"]) >= 1

    async def test_musehub_search_invalid_mode_returns_422(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()

        resp = await client.get(
            f"/api/repos/{repo_id}/search?q=foo&mode=badmode",
            headers=auth_headers,
        )
        assert resp.status_code == 422

    async def test_musehub_search_unknown_repo_returns_404(
        self, client: AsyncClient, auth_headers: StrDict
    ) -> None:
        fake_id = secrets.token_hex(16)
        resp = await client.get(
            f"/api/repos/{fake_id}/search?q=jazz&mode=keyword",
            headers=auth_headers,
        )
        assert resp.status_code == 404

    async def test_musehub_search_private_repo_no_auth_returns_401(
        self, client: AsyncClient, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session, visibility="private")
        await db_session.commit()

        resp = await client.get(
            f"/api/repos/{repo_id}/search?q=jazz&mode=keyword",
        )
        assert resp.status_code == 401


# ── Layer 4 — Stress ─────────────────────────────────────────────────────────


class TestStressSearch:
    async def test_200_commits_keyword_respects_limit(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        for i in range(200):
            await _commit(db_session, repo_id, message=f"jazz groove {i}")
        await db_session.commit()

        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="jazz", limit=10
        )
        assert len(result.matches) <= 10
        assert result.total_scanned >= 200

    async def test_5_concurrent_keyword_searches(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        for i in range(20):
            await _commit(db_session, repo_id, message=f"harmony beat {i}")
        await db_session.commit()

        results = [
            await search_by_keyword(db_session, repo_id=repo_id, keyword="harmony")
            for _ in range(5)
        ]
        assert all(len(r.matches) > 0 for r in results)


# ── Layer 5 — Data Integrity ──────────────────────────────────────────────────


class TestDataIntegritySearch:
    async def test_keyword_match_source_is_message(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="jazz fusion experiment")
        await db_session.commit()

        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="jazz")
        assert all(m.match_source == "message" for m in result.matches)

    async def test_keyword_score_in_zero_to_one(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="jazz harmony groove")
        await db_session.commit()

        result = await search_by_keyword(db_session, repo_id=repo_id, keyword="jazz rhythm harmony")
        for m in result.matches:
            assert 0.0 <= m.score <= 1.0

    async def test_pattern_message_match_source(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="jazz fusion")
        await db_session.commit()

        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")
        assert result.matches[0].match_source == "message"

    async def test_pattern_branch_match_source(self, db_session: AsyncSession) -> None:
        repo_id = await _repo(db_session)
        await _commit(
            db_session, repo_id, message="unrelated commit", branch="jazz-experiment"
        )
        await db_session.commit()

        result = await search_by_pattern(db_session, repo_id=repo_id, pattern="jazz")
        assert result.matches[0].match_source == "branch"

    async def test_ask_higher_overlap_scores_higher(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        # high-match commit has both "jazz" and "harmony"
        await _commit(db_session, repo_id, message="jazz harmony fusion")
        # low-match commit has only "jazz"
        await _commit(db_session, repo_id, message="jazz rock experiment")
        await db_session.commit()

        result = await search_by_ask(
            db_session, repo_id=repo_id, question="jazz harmony"
        )
        assert len(result.matches) >= 2
        # First result should have the higher-scoring commit
        assert result.matches[0].score >= result.matches[1].score


# ── Layer 6 — Security ────────────────────────────────────────────────────────


class TestSecuritySearch:
    async def test_sql_injection_in_pattern_handled_safely(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await _commit(db_session, repo_id, message="innocent commit")
        await db_session.commit()

        # SQL injection attempt — should return 0 matches, not crash or return all rows.
        result = await search_by_pattern(
            db_session,
            repo_id=repo_id,
            pattern="'; DROP TABLE musecli_commits; --",
        )
        assert result.matches == []

    async def test_xss_in_query_echoed_in_json_not_rendered(
        self, client: AsyncClient, db_session: AsyncSession, auth_headers: StrDict
    ) -> None:
        repo_id = await _repo(db_session, visibility="private")
        await db_session.commit()

        xss = "<script>alert(1)</script>"
        resp = await client.get(
            f"/api/repos/{repo_id}/search?q={xss}&mode=keyword",
            headers=auth_headers,
        )
        assert resp.status_code == 200
        body = resp.json()
        # The query is echoed back but must be in JSON (string), not HTML.
        assert body["query"] == xss
        assert resp.headers["content-type"].startswith("application/json")

    async def test_very_long_query_does_not_crash(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()

        long_query = "jazz " * 2000  # 10k chars
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword=long_query
        )
        assert result.matches == []

    async def test_global_search_route_max_length_enforced(
        self, client: AsyncClient, auth_headers: StrDict
    ) -> None:
        # GET /api/search/repos has max_length=500 on q — over that → 422.
        long_q = "x" * 501
        resp = await client.get(f"/api/search/repos?q={long_q}", headers=auth_headers)
        assert resp.status_code == 422

    async def test_null_byte_in_pattern_handled(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        await db_session.commit()

        # Null bytes must not cause a crash.
        result = await search_by_pattern(
            db_session, repo_id=repo_id, pattern="foo\x00bar"
        )
        assert isinstance(result.matches, list)


# ── Layer 7 — Performance ─────────────────────────────────────────────────────


class TestPerformanceSearch:
    def test_1000_tokenize_calls_under_100ms(self) -> None:
        texts = [f"add harmony voice to track {i}" for i in range(1000)]
        start = time.perf_counter()
        for t in texts:
            _tokenize(t)
        elapsed = time.perf_counter() - start
        assert elapsed < 0.1, f"1000 _tokenize calls took {elapsed:.3f}s (expected <0.1s)"

    async def test_keyword_search_500_commits_under_500ms(
        self, db_session: AsyncSession
    ) -> None:
        repo_id = await _repo(db_session)
        for i in range(500):
            await _commit(db_session, repo_id, message=f"jazz groove rhythm {i}")
        await db_session.commit()

        start = time.perf_counter()
        result = await search_by_keyword(
            db_session, repo_id=repo_id, keyword="jazz"
        )
        elapsed = time.perf_counter() - start
        assert elapsed < 0.5, f"search over 500 commits took {elapsed:.3f}s (expected <0.5s)"
        assert len(result.matches) > 0