"""Tests for the frontier baseline runner.

We don't actually hit any provider APIs in tests — we verify:
  - Parse function works
  - Provider registry is complete
  - Providers gracefully report unavailable when keys are missing
  - Comparison output structure is correct
"""

from __future__ import annotations

import pytest

from eval.frontier_baseline import (
    PROVIDER_REGISTRY,
    AnthropicProvider,
    GeminiProvider,
    GroqProvider,
    HuggingFaceProvider,
    OpenAIProvider,
    _build_provider_lineup,
    parse_frontier_score,
)


@pytest.mark.unit
def test_all_five_providers_registered():
    """The registry exposes an entry for each of the five expected provider keys."""
    expected_keys = ("openai", "groq", "hf", "anthropic", "gemini")
    for key in expected_keys:
        assert key in PROVIDER_REGISTRY


@pytest.mark.unit
def test_parse_frontier_score_valid_json():
    """A well-formed JSON reply yields the value of its ``score`` field."""
    payload = '{"score": 0.74, "explanation": "OTP ask + urgency"}'
    parsed = parse_frontier_score(payload)
    assert parsed == 0.74


@pytest.mark.unit
def test_parse_frontier_score_clamped():
    """A score above 1.0 comes back as 1.0; a negative score comes back as 0.0."""
    too_high = parse_frontier_score('{"score": 2.0}')
    assert too_high == 1.0

    too_low = parse_frontier_score('{"score": -0.5}')
    assert too_low == 0.0


@pytest.mark.unit
def test_parse_frontier_score_regex_fallback():
    """A non-JSON reply that still contains ``score: <number>`` yields that number."""
    free_text = "The score for this message is score: 0.64 because..."
    parsed = parse_frontier_score(free_text)
    assert parsed == 0.64


@pytest.mark.unit
def test_parse_frontier_score_empty_returns_zero():
    """An empty reply and a reply with no recognisable score both parse to 0.0."""
    for unparseable in ("", "totally unstructured response"):
        assert parse_frontier_score(unparseable) == 0.0


@pytest.mark.unit
def test_parse_frontier_score_strips_reasoning_block():
    """A score mentioned inside <think>...</think> is ignored in favour of the final JSON.

    DeepSeek-R1 / o1-class models emit a reasoning block before the answer.
    """
    # The reasoning deliberately mentions a decoy score (0.0) that must not win.
    reasoning = (
        "<think>\n"
        "This message is asking for an OTP. OTP requests are a UPI fraud signal.\n"
        "I would estimate {\"score\": 0.0} is wrong; this is clearly a scam.\n"
        "</think>\n"
    )
    answer = '{"score": 0.95, "reason": "OTP request"}'
    assert parse_frontier_score(reasoning + answer) == 0.95


@pytest.mark.unit
def test_parse_frontier_score_handles_unclosed_think_block():
    """A reply cut off mid-thought (no closing </think>, no answer) parses to 0.0.

    This models a reply that hits the token cap before the reasoning block ends.
    """
    truncated = (
        "<think>\n"
        "Let me reason about whether this is a scam. The message asks for an OTP\n"
        "which is suspicious. I would say {\"score\": 0.05} but actually...\n"
    )
    parsed = parse_frontier_score(truncated)
    # The 0.05 appears only inside the unfinished thinking block, so it must
    # not be picked up; with nothing after it the expected result is 0.0.
    assert parsed == 0.0


@pytest.mark.unit
def test_parse_frontier_score_fenced_json():
    """JSON wrapped in a ```json markdown fence is still parsed for its score."""
    fenced = '```json\n{"score": 0.82, "reason": "trust grooming"}\n```'
    parsed = parse_frontier_score(fenced)
    assert parsed == 0.82


@pytest.mark.unit
def test_parse_frontier_score_fenced_json_no_lang():
    """JSON wrapped in a bare ``` fence (no language tag) is still parsed for its score."""
    fenced = '```\n{"score": 0.42}\n```'
    parsed = parse_frontier_score(fenced)
    assert parsed == 0.42


@pytest.mark.unit
def test_openai_provider_unavailable_without_key(monkeypatch):
    """With OPENAI_API_KEY removed from the environment, available() is False."""
    # raising=False: the variable may not be set in the first place.
    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
    provider = OpenAIProvider()
    is_available = provider.available()
    assert is_available is False


@pytest.mark.unit
def test_groq_provider_unavailable_without_key(monkeypatch):
    """With GROQ_API_KEY removed from the environment, available() is False."""
    # raising=False: the variable may not be set in the first place.
    monkeypatch.delenv("GROQ_API_KEY", raising=False)
    provider = GroqProvider()
    is_available = provider.available()
    assert is_available is False


@pytest.mark.unit
def test_anthropic_provider_unavailable_without_key(monkeypatch):
    """With ANTHROPIC_API_KEY removed from the environment, available() is False."""
    # raising=False: the variable may not be set in the first place.
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
    provider = AnthropicProvider()
    is_available = provider.available()
    assert is_available is False


@pytest.mark.unit
def test_gemini_provider_unavailable_without_key(monkeypatch):
    """With GEMINI_API_KEY removed from the environment, available() is False."""
    # NOTE(review): only GEMINI_API_KEY is cleared here; confirm the provider
    # does not also read another variable (e.g. GOOGLE_API_KEY).
    monkeypatch.delenv("GEMINI_API_KEY", raising=False)
    provider = GeminiProvider()
    is_available = provider.available()
    assert is_available is False


@pytest.mark.unit
def test_provider_spec_has_name():
    """Every registered provider class builds with no arguments and has a non-empty spec name."""
    for key, provider_cls in PROVIDER_REGISTRY.items():
        provider = provider_cls()
        # Name the registry key in the failure so the broken provider is
        # identifiable without re-running under a debugger.
        assert provider.spec.name, f"provider {key!r} has an empty spec.name"


@pytest.mark.unit
def test_hf_provider_unavailable_without_token(monkeypatch):
    """With HF_TOKEN removed from the environment, available() is False."""
    # NOTE(review): only HF_TOKEN is cleared here; confirm the provider does
    # not also accept another token variable.
    monkeypatch.delenv("HF_TOKEN", raising=False)
    provider = HuggingFaceProvider()
    is_available = provider.available()
    assert is_available is False


@pytest.mark.unit
def test_hf_provider_default_model_id():
    """A HuggingFaceProvider built with no arguments uses the Llama 3.3 70B model id."""
    provider = HuggingFaceProvider()

    model_id = provider._model
    assert model_id == "meta-llama/Llama-3.3-70B-Instruct"

    display_name = provider.spec.name
    assert display_name == "hf-llama-3.3-70b-instruct"


@pytest.mark.unit
def test_hf_provider_custom_model_id():
    """An explicit ``model`` argument is stored as-is and drives the display name."""
    provider = HuggingFaceProvider(model="Qwen/Qwen3-72B-Instruct")

    model_id = provider._model
    assert model_id == "Qwen/Qwen3-72B-Instruct"

    display_name = provider.spec.name
    assert display_name == "hf-qwen3-72b-instruct"


@pytest.mark.unit
def test_hf_provider_pinned_provider_suffix_strips_for_display():
    """The `model:provider` pin stays in the stored model id but not in the display name.

    Keeping the suffix on ``_model`` keeps the API call pinned; dropping it from
    ``spec.name`` keeps the CSV column human-readable.
    """
    pinned_model = "meta-llama/Llama-3.1-405B-Instruct:together"
    provider = HuggingFaceProvider(model=pinned_model)

    assert provider._model == "meta-llama/Llama-3.1-405B-Instruct:together"
    assert provider.spec.name == "hf-llama-3.1-405b-instruct"


@pytest.mark.unit
def test_build_lineup_always_starts_with_scripted():
    """The first lineup entry is the scripted pair even when only "groq" is requested."""
    head = _build_provider_lineup(["groq"])[0]
    assert head == ("scripted", "scripted")


@pytest.mark.unit
def test_build_lineup_dedupes_repeated_providers():
    """Repeated provider tokens appear once, in first-seen order, after scripted."""
    requested = ["scripted", "groq", "groq", "openai"]
    lineup = _build_provider_lineup(requested)
    labels = [label for label, _provider in lineup]
    assert labels == ["scripted", "groq", "openai"]


@pytest.mark.unit
def test_build_lineup_skips_unknown_provider():
    """An unrecognised provider token is dropped without losing the valid ones."""
    lineup = _build_provider_lineup(["groq", "totally-bogus"])
    names = [n for n, _ in lineup]
    assert "totally-bogus" not in names
    # Guard against a vacuous pass: skipping the bogus token must not also
    # discard the valid provider that was requested alongside it.
    assert "groq" in names


@pytest.mark.unit
def test_build_lineup_hf_default_single_instance():
    """Without --hf-models, the `hf` token in --providers yields one default instance."""
    lineup = _build_provider_lineup(["hf"])
    # Expect exactly two entries: the scripted one plus a single hf provider.
    assert len(lineup) == 2

    label, provider = lineup[1]
    assert label == "hf"
    assert isinstance(provider, HuggingFaceProvider)
    assert provider._model == HuggingFaceProvider.DEFAULT_MODEL


@pytest.mark.unit
def test_build_lineup_hf_models_expands_to_one_instance_per_model():
    """Passing hf_models yields one HuggingFaceProvider per model, in the given order."""
    requested = [
        "meta-llama/Llama-3.3-70B-Instruct",
        "Qwen/Qwen3-72B-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
    ]
    lineup = _build_provider_lineup(["hf"], hf_models=requested)

    # One scripted entry followed by one hf entry per requested model.
    assert len(lineup) == 1 + len(requested)

    expanded = lineup[1:]
    for label, provider in expanded:
        assert isinstance(provider, HuggingFaceProvider)
        assert label.startswith("hf-")

    assert [provider._model for _label, provider in expanded] == requested


@pytest.mark.unit
def test_build_lineup_hf_models_ignored_when_hf_not_in_providers():
    """If user passes --hf-models but doesn't include 'hf' in providers, it's a no-op."""
    unused_models = ["meta-llama/Llama-3.3-70B-Instruct"]
    lineup = _build_provider_lineup(["groq"], hf_models=unused_models)
    labels = [label for label, _provider in lineup]

    # Neither the bare "hf" entry nor any per-model "hf-..." entry may appear.
    assert "hf" not in labels
    assert all(not label.startswith("hf-") for label in labels)
    assert "groq" in labels