Spaces:
Running
Running
| """Tests for the frontier baseline runner. | |
| We don't actually hit any provider APIs in tests — we verify: | |
| - Parse function works | |
| - Provider registry is complete | |
| - Providers gracefully report unavailable when keys are missing | |
| - Comparison output structure is correct | |
| """ | |
| from __future__ import annotations | |
| import pytest | |
| from eval.frontier_baseline import ( | |
| PROVIDER_REGISTRY, | |
| AnthropicProvider, | |
| GeminiProvider, | |
| GroqProvider, | |
| HuggingFaceProvider, | |
| OpenAIProvider, | |
| _build_provider_lineup, | |
| parse_frontier_score, | |
| ) | |
def test_all_five_providers_registered():
    """The registry must contain an entry for each of the five provider keys."""
    # Checked in a fixed order so the first missing key is the one reported.
    for provider_key in ("openai", "groq", "hf", "anthropic", "gemini"):
        assert provider_key in PROVIDER_REGISTRY
def test_parse_frontier_score_valid_json():
    """A well-formed JSON object yields its ``score`` value."""
    payload = '{"score": 0.74, "explanation": "OTP ask + urgency"}'
    parsed = parse_frontier_score(payload)
    assert parsed == 0.74
def test_parse_frontier_score_clamped():
    """Out-of-range scores come back as 1.0 (too high) or 0.0 (too low)."""
    cases = (
        ('{"score": 2.0}', 1.0),
        ('{"score": -0.5}', 0.0),
    )
    for payload, expected in cases:
        assert parse_frontier_score(payload) == expected
def test_parse_frontier_score_regex_fallback():
    """A ``score: <number>`` fragment in free text is still picked up."""
    response = "The score for this message is score: 0.64 because..."
    parsed = parse_frontier_score(response)
    assert parsed == 0.64
def test_parse_frontier_score_empty_returns_zero():
    """Input with no recoverable score parses to 0.0."""
    for unparseable in ("", "totally unstructured response"):
        assert parse_frontier_score(unparseable) == 0.0
def test_parse_frontier_score_strips_reasoning_block():
    """Reasoning models (DeepSeek-R1 / o1-class) emit <think>...</think> first.

    The decoy score inside the think block must be ignored in favour of the
    JSON answer that follows the closing tag.
    """
    response = (
        "<think>\n"
        "This message is asking for an OTP. OTP requests are a UPI fraud signal.\n"
        "I would estimate {\"score\": 0.0} is wrong; this is clearly a scam.\n"
        "</think>\n"
        '{"score": 0.95, "reason": "OTP request"}'
    )
    parsed = parse_frontier_score(response)
    assert parsed == 0.95
def test_parse_frontier_score_handles_unclosed_think_block():
    """A response cut off mid-thought never produces a closing </think>."""
    truncated = (
        "<think>\n"
        "Let me reason about whether this is a scam. The message asks for an OTP\n"
        "which is suspicious. I would say {\"score\": 0.05} but actually...\n"
    )
    parsed = parse_frontier_score(truncated)
    # There is no closing tag and no JSON afterwards, so the expected result
    # is 0.0 rather than the misleading 0.05 inside the thinking text.
    assert parsed == 0.0
def test_parse_frontier_score_fenced_json():
    """JSON wrapped in a ```json markdown fence is still parsed."""
    fenced = '```json\n{"score": 0.82, "reason": "trust grooming"}\n```'
    parsed = parse_frontier_score(fenced)
    assert parsed == 0.82
def test_parse_frontier_score_fenced_json_no_lang():
    """A code fence without a language tag is handled like a tagged one."""
    fenced = '```\n{"score": 0.42}\n```'
    parsed = parse_frontier_score(fenced)
    assert parsed == 0.42
def test_openai_provider_unavailable_without_key(monkeypatch):
    """With OPENAI_API_KEY removed, the OpenAI provider reports unavailable."""
    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
    provider = OpenAIProvider()
    is_available = provider.available()
    assert is_available is False
def test_groq_provider_unavailable_without_key(monkeypatch):
    """With GROQ_API_KEY removed, the Groq provider reports unavailable."""
    monkeypatch.delenv("GROQ_API_KEY", raising=False)
    provider = GroqProvider()
    is_available = provider.available()
    assert is_available is False
def test_anthropic_provider_unavailable_without_key(monkeypatch):
    """With ANTHROPIC_API_KEY removed, the Anthropic provider reports unavailable."""
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
    provider = AnthropicProvider()
    is_available = provider.available()
    assert is_available is False
def test_gemini_provider_unavailable_without_key(monkeypatch):
    """With GEMINI_API_KEY removed, the Gemini provider reports unavailable."""
    monkeypatch.delenv("GEMINI_API_KEY", raising=False)
    provider = GeminiProvider()
    is_available = provider.available()
    assert is_available is False
def test_provider_spec_has_name():
    """Every registered provider class must expose a non-empty ``spec.name``.

    Each class in ``PROVIDER_REGISTRY`` is instantiated with no arguments.
    The registry key is included in the failure message so a broken provider
    can be identified directly from the test output.
    """
    for registry_key, provider_cls in PROVIDER_REGISTRY.items():
        provider = provider_cls()
        assert provider.spec.name, (
            f"provider {registry_key!r} has an empty spec.name"
        )
def test_hf_provider_unavailable_without_token(monkeypatch):
    """With HF_TOKEN removed, the Hugging Face provider reports unavailable."""
    monkeypatch.delenv("HF_TOKEN", raising=False)
    provider = HuggingFaceProvider()
    is_available = provider.available()
    assert is_available is False
def test_hf_provider_default_model_id():
    """A provider built with no arguments uses the pinned default model."""
    provider = HuggingFaceProvider()
    model_id = provider._model
    display_name = provider.spec.name
    assert model_id == "meta-llama/Llama-3.3-70B-Instruct"
    assert display_name == "hf-llama-3.3-70b-instruct"
def test_hf_provider_custom_model_id():
    """An explicit ``model`` is stored as given and drives the display name."""
    provider = HuggingFaceProvider(model="Qwen/Qwen3-72B-Instruct")
    model_id = provider._model
    display_name = provider.spec.name
    assert model_id == "Qwen/Qwen3-72B-Instruct"
    assert display_name == "hf-qwen3-72b-instruct"
def test_hf_provider_pinned_provider_suffix_strips_for_display():
    """The `model:provider` pin stays on the model id but not the display name.

    The stored model keeps the `:together` suffix, while `spec.name` omits it
    so the CSV column remains human-readable.
    """
    pinned_model = "meta-llama/Llama-3.1-405B-Instruct:together"
    provider = HuggingFaceProvider(
        model="meta-llama/Llama-3.1-405B-Instruct:together"
    )
    assert provider._model == pinned_model
    assert provider.spec.name == "hf-llama-3.1-405b-instruct"
def test_build_lineup_always_starts_with_scripted():
    """The scripted baseline is the first entry even when not requested."""
    first_entry = _build_provider_lineup(["groq"])[0]
    assert first_entry == ("scripted", "scripted")
def test_build_lineup_dedupes_repeated_providers():
    """Repeated provider tokens collapse to one entry, keeping first-seen order."""
    requested = ["scripted", "groq", "groq", "openai"]
    entries = _build_provider_lineup(requested)
    labels = []
    for label, _provider in entries:
        labels.append(label)
    assert labels == ["scripted", "groq", "openai"]
def test_build_lineup_skips_unknown_provider():
    """A provider token that is not recognised does not appear in the lineup."""
    entries = _build_provider_lineup(["groq", "totally-bogus"])
    labels = [label for label, _provider in entries]
    assert "totally-bogus" not in labels
def test_build_lineup_hf_default_single_instance():
    """With no hf_models given, the `hf` token produces one default-model entry."""
    entries = _build_provider_lineup(["hf"])
    # Expect exactly two entries: the scripted baseline plus one hf provider.
    assert len(entries) == 2
    label, provider = entries[1]
    assert label == "hf"
    assert isinstance(provider, HuggingFaceProvider)
    assert provider._model == HuggingFaceProvider.DEFAULT_MODEL
def test_build_lineup_hf_models_expands_to_one_instance_per_model():
    """Each hf_models id becomes its own `hf-` entry, in the order given."""
    requested_models = [
        "meta-llama/Llama-3.3-70B-Instruct",
        "Qwen/Qwen3-72B-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
    ]
    entries = _build_provider_lineup(["hf"], hf_models=requested_models)
    # One scripted entry plus one hf entry per requested model.
    assert len(entries) == 1 + len(requested_models)
    # The length check above guarantees zip() pairs every hf entry with
    # exactly one requested model.
    for (label, provider), expected_model in zip(entries[1:], requested_models):
        assert isinstance(provider, HuggingFaceProvider)
        assert label.startswith("hf-")
        assert provider._model == expected_model
def test_build_lineup_hf_models_ignored_when_hf_not_in_providers():
    """Supplying hf_models without the 'hf' provider token adds no hf entries."""
    entries = _build_provider_lineup(
        ["groq"],
        hf_models=["meta-llama/Llama-3.3-70B-Instruct"],
    )
    labels = [label for label, _provider in entries]
    assert "hf" not in labels
    assert all(not label.startswith("hf-") for label in labels)
    assert "groq" in labels