"""Tests for the frontier baseline runner. We don't actually hit any provider APIs in tests — we verify: - Parse function works - Provider registry is complete - Providers gracefully report unavailable when keys are missing - Comparison output structure is correct """ from __future__ import annotations import pytest from eval.frontier_baseline import ( PROVIDER_REGISTRY, AnthropicProvider, GeminiProvider, GroqProvider, HuggingFaceProvider, OpenAIProvider, _build_provider_lineup, parse_frontier_score, ) @pytest.mark.unit def test_all_five_providers_registered(): assert "openai" in PROVIDER_REGISTRY assert "groq" in PROVIDER_REGISTRY assert "hf" in PROVIDER_REGISTRY assert "anthropic" in PROVIDER_REGISTRY assert "gemini" in PROVIDER_REGISTRY @pytest.mark.unit def test_parse_frontier_score_valid_json(): raw = '{"score": 0.74, "explanation": "OTP ask + urgency"}' assert parse_frontier_score(raw) == 0.74 @pytest.mark.unit def test_parse_frontier_score_clamped(): assert parse_frontier_score('{"score": 2.0}') == 1.0 assert parse_frontier_score('{"score": -0.5}') == 0.0 @pytest.mark.unit def test_parse_frontier_score_regex_fallback(): raw = "The score for this message is score: 0.64 because..." assert parse_frontier_score(raw) == 0.64 @pytest.mark.unit def test_parse_frontier_score_empty_returns_zero(): assert parse_frontier_score("") == 0.0 assert parse_frontier_score("totally unstructured response") == 0.0 @pytest.mark.unit def test_parse_frontier_score_strips_reasoning_block(): """DeepSeek-R1 / o1-class models emit ... before the answer.""" raw = ( "\n" "This message is asking for an OTP. OTP requests are a UPI fraud signal.\n" "I would estimate {\"score\": 0.0} is wrong; this is clearly a scam.\n" "\n" '{"score": 0.95, "reason": "OTP request"}' ) assert parse_frontier_score(raw) == 0.95 @pytest.mark.unit def test_parse_frontier_score_handles_unclosed_think_block(): """If the model hits the token cap mid-thought, no closing arrives.""" raw = ( "\n" "Let me reason about whether this is a scam. The message asks for an OTP\n" "which is suspicious. I would say {\"score\": 0.05} but actually...\n" ) # No closing and no JSON after — should fall through to 0.0, # not pick up the misleading 0.05 inside the thinking block. assert parse_frontier_score(raw) == 0.0 @pytest.mark.unit def test_parse_frontier_score_fenced_json(): """Some models wrap their JSON in markdown code fences.""" raw = '```json\n{"score": 0.82, "reason": "trust grooming"}\n```' assert parse_frontier_score(raw) == 0.82 @pytest.mark.unit def test_parse_frontier_score_fenced_json_no_lang(): raw = '```\n{"score": 0.42}\n```' assert parse_frontier_score(raw) == 0.42 @pytest.mark.unit def test_openai_provider_unavailable_without_key(monkeypatch): monkeypatch.delenv("OPENAI_API_KEY", raising=False) p = OpenAIProvider() assert p.available() is False @pytest.mark.unit def test_groq_provider_unavailable_without_key(monkeypatch): monkeypatch.delenv("GROQ_API_KEY", raising=False) p = GroqProvider() assert p.available() is False @pytest.mark.unit def test_anthropic_provider_unavailable_without_key(monkeypatch): monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) p = AnthropicProvider() assert p.available() is False @pytest.mark.unit def test_gemini_provider_unavailable_without_key(monkeypatch): monkeypatch.delenv("GEMINI_API_KEY", raising=False) p = GeminiProvider() assert p.available() is False @pytest.mark.unit def test_provider_spec_has_name(): for name, cls in PROVIDER_REGISTRY.items(): p = cls() assert p.spec.name # non-empty @pytest.mark.unit def test_hf_provider_unavailable_without_token(monkeypatch): monkeypatch.delenv("HF_TOKEN", raising=False) p = HuggingFaceProvider() assert p.available() is False @pytest.mark.unit def test_hf_provider_default_model_id(): p = HuggingFaceProvider() assert p._model == "meta-llama/Llama-3.3-70B-Instruct" assert p.spec.name == "hf-llama-3.3-70b-instruct" @pytest.mark.unit def test_hf_provider_custom_model_id(): p = HuggingFaceProvider(model="Qwen/Qwen3-72B-Instruct") assert p._model == "Qwen/Qwen3-72B-Instruct" assert p.spec.name == "hf-qwen3-72b-instruct" @pytest.mark.unit def test_hf_provider_pinned_provider_suffix_strips_for_display(): """`model:provider` pin syntax keeps the API call pinned but strips the suffix from the display name so the CSV column stays human-readable.""" p = HuggingFaceProvider(model="meta-llama/Llama-3.1-405B-Instruct:together") assert p._model == "meta-llama/Llama-3.1-405B-Instruct:together" assert p.spec.name == "hf-llama-3.1-405b-instruct" @pytest.mark.unit def test_build_lineup_always_starts_with_scripted(): lineup = _build_provider_lineup(["groq"]) assert lineup[0] == ("scripted", "scripted") @pytest.mark.unit def test_build_lineup_dedupes_repeated_providers(): lineup = _build_provider_lineup(["scripted", "groq", "groq", "openai"]) names = [n for n, _ in lineup] assert names == ["scripted", "groq", "openai"] @pytest.mark.unit def test_build_lineup_skips_unknown_provider(): lineup = _build_provider_lineup(["groq", "totally-bogus"]) names = [n for n, _ in lineup] assert "totally-bogus" not in names @pytest.mark.unit def test_build_lineup_hf_default_single_instance(): """Without --hf-models, the `hf` token in --providers yields one default instance.""" lineup = _build_provider_lineup(["hf"]) assert len(lineup) == 2 # scripted + one hf name, inst = lineup[1] assert name == "hf" assert isinstance(inst, HuggingFaceProvider) assert inst._model == HuggingFaceProvider.DEFAULT_MODEL @pytest.mark.unit def test_build_lineup_hf_models_expands_to_one_instance_per_model(): models = [ "meta-llama/Llama-3.3-70B-Instruct", "Qwen/Qwen3-72B-Instruct", "deepseek-ai/DeepSeek-V3-0324", ] lineup = _build_provider_lineup(["hf"], hf_models=models) # scripted + 3 hf instances assert len(lineup) == 1 + len(models) hf_entries = lineup[1:] seen_models = [] for name, inst in hf_entries: assert isinstance(inst, HuggingFaceProvider) assert name.startswith("hf-") seen_models.append(inst._model) assert seen_models == models @pytest.mark.unit def test_build_lineup_hf_models_ignored_when_hf_not_in_providers(): """If user passes --hf-models but doesn't include 'hf' in providers, it's a no-op.""" lineup = _build_provider_lineup( ["groq"], hf_models=["meta-llama/Llama-3.3-70B-Instruct"] ) names = [n for n, _ in lineup] assert "hf" not in names assert not any(n.startswith("hf-") for n in names) assert "groq" in names