# chakravyuh/tests/test_frontier_baseline.py
# (deployed to HF Space from latest main; commit 03815d6 by UjjwalPardeshi)
"""Tests for the frontier baseline runner.
We don't actually hit any provider APIs in tests — we verify:
- Parse function works
- Provider registry is complete
- Providers gracefully report unavailable when keys are missing
- Comparison output structure is correct
"""
from __future__ import annotations
import pytest
from eval.frontier_baseline import (
PROVIDER_REGISTRY,
AnthropicProvider,
GeminiProvider,
GroqProvider,
HuggingFaceProvider,
OpenAIProvider,
_build_provider_lineup,
parse_frontier_score,
)
@pytest.mark.unit
def test_all_five_providers_registered():
    """All five supported frontier providers appear in the registry."""
    for key in ("openai", "groq", "hf", "anthropic", "gemini"):
        assert key in PROVIDER_REGISTRY
@pytest.mark.unit
def test_parse_frontier_score_valid_json():
    """A well-formed JSON payload yields its score value verbatim."""
    payload = '{"score": 0.74, "explanation": "OTP ask + urgency"}'
    result = parse_frontier_score(payload)
    assert result == 0.74
@pytest.mark.unit
def test_parse_frontier_score_clamped():
    """Out-of-range scores are clamped into the [0.0, 1.0] interval."""
    cases = [
        ('{"score": 2.0}', 1.0),
        ('{"score": -0.5}', 0.0),
    ]
    for raw, expected in cases:
        assert parse_frontier_score(raw) == expected
@pytest.mark.unit
def test_parse_frontier_score_regex_fallback():
    """Free-text output without JSON falls back to a `score: <float>` scan."""
    free_text = "The score for this message is score: 0.64 because..."
    assert parse_frontier_score(free_text) == 0.64
@pytest.mark.unit
def test_parse_frontier_score_empty_returns_zero():
    """Unparseable input — empty or unstructured — defaults to 0.0."""
    for junk in ("", "totally unstructured response"):
        assert parse_frontier_score(junk) == 0.0
@pytest.mark.unit
def test_parse_frontier_score_strips_reasoning_block():
    """DeepSeek-R1 / o1-class models emit <think>...</think> before the answer."""
    # The misleading {"score": 0.0} inside the think block must be ignored;
    # only the JSON that follows the closing tag counts.
    raw = "\n".join(
        [
            "<think>",
            "This message is asking for an OTP. OTP requests are a UPI fraud signal.",
            'I would estimate {"score": 0.0} is wrong; this is clearly a scam.',
            "</think>",
            '{"score": 0.95, "reason": "OTP request"}',
        ]
    )
    assert parse_frontier_score(raw) == 0.95
@pytest.mark.unit
def test_parse_frontier_score_handles_unclosed_think_block():
    """If the model hits the token cap mid-thought, no closing </think> arrives."""
    # Trailing "" keeps the final newline the original literal produced.
    raw = "\n".join(
        [
            "<think>",
            "Let me reason about whether this is a scam. The message asks for an OTP",
            'which is suspicious. I would say {"score": 0.05} but actually...',
            "",
        ]
    )
    # No closing </think> and no JSON after — should fall through to 0.0,
    # not pick up the misleading 0.05 inside the thinking block.
    assert parse_frontier_score(raw) == 0.0
@pytest.mark.unit
def test_parse_frontier_score_fenced_json():
    """Some models wrap their JSON in markdown code fences with a language tag."""
    fenced = '```json\n{"score": 0.82, "reason": "trust grooming"}\n```'
    assert parse_frontier_score(fenced) == 0.82
@pytest.mark.unit
def test_parse_frontier_score_fenced_json_no_lang():
    """Fences without a language tag are stripped the same way."""
    fenced = '```\n{"score": 0.42}\n```'
    assert parse_frontier_score(fenced) == 0.42
@pytest.mark.unit
def test_openai_provider_unavailable_without_key(monkeypatch):
    """With no OPENAI_API_KEY in the environment, the provider opts out."""
    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
    assert OpenAIProvider().available() is False
@pytest.mark.unit
def test_groq_provider_unavailable_without_key(monkeypatch):
    """With no GROQ_API_KEY in the environment, the provider opts out."""
    monkeypatch.delenv("GROQ_API_KEY", raising=False)
    assert GroqProvider().available() is False
@pytest.mark.unit
def test_anthropic_provider_unavailable_without_key(monkeypatch):
    """With no ANTHROPIC_API_KEY in the environment, the provider opts out."""
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
    assert AnthropicProvider().available() is False
@pytest.mark.unit
def test_gemini_provider_unavailable_without_key(monkeypatch):
    """With no GEMINI_API_KEY in the environment, the provider opts out."""
    monkeypatch.delenv("GEMINI_API_KEY", raising=False)
    assert GeminiProvider().available() is False
@pytest.mark.unit
def test_provider_spec_has_name():
    """Every registered provider class yields an instance with a non-empty spec name.

    Fix: the loop variable ``name`` was previously bound but unused; it is now
    used in the assertion message so a failure identifies which registry key
    produced the empty spec name.
    """
    for key, provider_cls in PROVIDER_REGISTRY.items():
        provider = provider_cls()
        assert provider.spec.name, f"provider {key!r} has an empty spec name"
@pytest.mark.unit
def test_hf_provider_unavailable_without_token(monkeypatch):
    """With no HF_TOKEN in the environment, the HF provider opts out."""
    monkeypatch.delenv("HF_TOKEN", raising=False)
    assert HuggingFaceProvider().available() is False
@pytest.mark.unit
def test_hf_provider_default_model_id():
    """Constructing without a model argument selects the default checkpoint."""
    provider = HuggingFaceProvider()
    assert provider._model == "meta-llama/Llama-3.3-70B-Instruct"
    assert provider.spec.name == "hf-llama-3.3-70b-instruct"
@pytest.mark.unit
def test_hf_provider_custom_model_id():
    """A caller-supplied model id is stored and reflected in the spec name."""
    provider = HuggingFaceProvider(model="Qwen/Qwen3-72B-Instruct")
    assert provider._model == "Qwen/Qwen3-72B-Instruct"
    assert provider.spec.name == "hf-qwen3-72b-instruct"
@pytest.mark.unit
def test_hf_provider_pinned_provider_suffix_strips_for_display():
    """`model:provider` pin syntax keeps the API call pinned but strips the suffix
    from the display name so the CSV column stays human-readable."""
    pinned = "meta-llama/Llama-3.1-405B-Instruct:together"
    provider = HuggingFaceProvider(model=pinned)
    # The full pinned id is kept for API calls...
    assert provider._model == pinned
    # ...but the ":together" suffix is dropped from the display name.
    assert provider.spec.name == "hf-llama-3.1-405b-instruct"
@pytest.mark.unit
def test_build_lineup_always_starts_with_scripted():
    """The scripted baseline is always the first entry of any lineup."""
    first_entry = _build_provider_lineup(["groq"])[0]
    assert first_entry == ("scripted", "scripted")
@pytest.mark.unit
def test_build_lineup_dedupes_repeated_providers():
    """Repeated provider tokens collapse to one lineup entry each."""
    lineup = _build_provider_lineup(["scripted", "groq", "groq", "openai"])
    assert [entry[0] for entry in lineup] == ["scripted", "groq", "openai"]
@pytest.mark.unit
def test_build_lineup_skips_unknown_provider():
    """Unrecognized provider tokens are silently dropped, not raised on."""
    lineup = _build_provider_lineup(["groq", "totally-bogus"])
    assert all(name != "totally-bogus" for name, _ in lineup)
@pytest.mark.unit
def test_build_lineup_hf_default_single_instance():
    """Without --hf-models, the `hf` token in --providers yields one default instance."""
    lineup = _build_provider_lineup(["hf"])
    assert len(lineup) == 2  # scripted + one hf
    name, provider = lineup[1]
    assert name == "hf"
    assert isinstance(provider, HuggingFaceProvider)
    assert provider._model == HuggingFaceProvider.DEFAULT_MODEL
@pytest.mark.unit
def test_build_lineup_hf_models_expands_to_one_instance_per_model():
    """Each entry in --hf-models becomes its own lineup slot after `scripted`."""
    models = [
        "meta-llama/Llama-3.3-70B-Instruct",
        "Qwen/Qwen3-72B-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
    ]
    lineup = _build_provider_lineup(["hf"], hf_models=models)
    # scripted + 3 hf instances
    assert len(lineup) == 1 + len(models)
    hf_entries = lineup[1:]
    for name, provider in hf_entries:
        assert isinstance(provider, HuggingFaceProvider)
        assert name.startswith("hf-")
    # Instance order must mirror the requested model order exactly.
    assert [provider._model for _, provider in hf_entries] == models
@pytest.mark.unit
def test_build_lineup_hf_models_ignored_when_hf_not_in_providers():
    """If user passes --hf-models but doesn't include 'hf' in providers, it's a no-op."""
    lineup = _build_provider_lineup(
        ["groq"], hf_models=["meta-llama/Llama-3.3-70B-Instruct"]
    )
    names = [entry[0] for entry in lineup]
    assert "hf" not in names
    assert all(not n.startswith("hf-") for n in names)
    assert "groq" in names