| from __future__ import annotations |
|
|
| import asyncio |
| import json |
| import logging |
| import time |
| from pathlib import Path |
| from types import SimpleNamespace |
| import types |
|
|
| import pytest |
| import httpx |
|
|
| from civicsetu.models.enums import DocType, Jurisdiction |
| from civicsetu.models.schemas import LegalChunk, RetrievedChunk |
|
|
|
|
| def _load_run_eval_module(): |
| """Return civicsetu.evaluation.ragas_eval, reloaded so env-var constants are fresh.""" |
| import importlib |
| import civicsetu.evaluation.ragas_eval as m |
| importlib.reload(m) |
| return m |
|
|
|
|
| class _SlowMetric: |
| def __init__(self, *args, **kwargs): |
| pass |
|
|
| async def ascore(self, **kwargs): |
| await asyncio.sleep(0.05) |
| return SimpleNamespace(value=0.9) |
|
|
| def batch_score(self, inputs): |
| time.sleep(0.05) |
| return [SimpleNamespace(value=0.9) for _ in inputs] |
|
|
|
|
| class _FailingMetric: |
| def __init__(self, *args, **kwargs): |
| pass |
|
|
| def batch_score(self, inputs): |
| raise RuntimeError("judge unavailable") |
|
|
|
|
| class _CaptureTransport(httpx.AsyncBaseTransport): |
| def __init__(self): |
| self.body = None |
|
|
| async def handle_async_request(self, request: httpx.Request) -> httpx.Response: |
| self.body = json.loads(request.content) |
| return httpx.Response(200, request=request) |
|
|
|
|
| def test_no_reasoning_transport_adds_flag_without_request_copy(): |
| run_eval = _load_run_eval_module() |
| wrapped = _CaptureTransport() |
| transport = run_eval._NoReasoningTransport(wrapped) |
| request = httpx.Request( |
| "POST", |
| "https://example.test/v1/chat/completions", |
| headers={"content-type": "application/json"}, |
| content=json.dumps({"model": "x"}).encode(), |
| ) |
|
|
| asyncio.run(transport.handle_async_request(request)) |
|
|
| assert wrapped.body == {"model": "x", "no_reasoning": True} |
|
|
|
|
| def test_disable_thinking_transport_injects_flags(): |
| run_eval = _load_run_eval_module() |
| wrapped = _CaptureTransport() |
| transport = run_eval._DisableThinkingTransport(wrapped) |
| request = httpx.Request( |
| "POST", |
| "https://integrate.api.nvidia.com/v1/chat/completions", |
| headers={"content-type": "application/json"}, |
| content=json.dumps({"model": "z-ai/glm4.7", "stream": True}).encode(), |
| ) |
|
|
| asyncio.run(transport.handle_async_request(request)) |
|
|
| assert wrapped.body["chat_template_kwargs"]["enable_thinking"] is False |
| assert wrapped.body["chat_template_kwargs"]["clear_thinking"] is False |
| assert wrapped.body["stream"] is False |
| assert wrapped.body["model"] == "z-ai/glm4.7" |
|
|
|
|
| def test_score_batch_runs_without_timeout(monkeypatch): |
| """score_batch has no timeout — slow metrics run to completion.""" |
| run_eval = _load_run_eval_module() |
|
|
| assert not hasattr(run_eval, "METRIC_TIMEOUT_SEC"), ( |
| "METRIC_TIMEOUT_SEC should not exist — timeout was removed to allow unbounded RAGAS scoring" |
| ) |
|
|
|
|
| def test_score_batch_returns_failed_rows_on_metric_error(monkeypatch): |
| run_eval = _load_run_eval_module() |
|
|
| import ragas.metrics.collections as collections |
|
|
| monkeypatch.setattr(collections, "Faithfulness", _FailingMetric) |
| monkeypatch.setattr(collections, "AnswerRelevancy", _FailingMetric) |
| monkeypatch.setattr(collections, "ContextPrecision", _FailingMetric) |
|
|
| rows = [ |
| { |
| "id": "CASE-001", |
| "query": "What is section 3?", |
| "answer": "A test answer", |
| "contexts": ["A test context"], |
| "ground_truth": "A test reference", |
| "latency_ms": 10.0, |
| "jurisdiction": "CENTRAL", |
| "query_type": "fact", |
| "error": None, |
| } |
| ] |
|
|
| scored = run_eval.score_batch(rows, judge_llm=object(), judge_embeddings=object(), label="T") |
|
|
| assert scored[0]["faithfulness"] == 0.0 |
| assert scored[0]["answer_relevancy"] == 0.0 |
| assert scored[0]["context_precision"] == 0.0 |
| assert scored[0]["pass"] is False |
| assert "judge unavailable" in scored[0]["error"].lower() |
|
|
|
|
| def test_run_phase1_retries_cached_fallback_rows(monkeypatch): |
| run_eval = _load_run_eval_module() |
| phase1_path = Path(__file__).resolve().parents[2] / "eval_phase1_results.test.json" |
| monkeypatch.setattr(run_eval, "PHASE1_OUTPUT", phase1_path) |
| phase1_path.unlink(missing_ok=True) |
|
|
| row = { |
| "id": "CASE-001", |
| "phase1_schema_version": run_eval.PHASE1_SCHEMA_VERSION, |
| "jurisdiction": "CENTRAL", |
| "query_type": "fact_lookup", |
| "query": "What are promoter duties?", |
| "ground_truth": "Promoters have statutory duties.", |
| } |
| phase1_path.write_text( |
| json.dumps( |
| [ |
| { |
| **row, |
| "answer": "Unable to generate a structured response. Please try again.", |
| "contexts": ["Section text"], |
| "citations_count": 0, |
| "confidence_score": 0.0, |
| "query_type_resolved": "fact_lookup", |
| "latency_ms": 10.0, |
| "error": None, |
| } |
| ] |
| ), |
| encoding="utf-8", |
| ) |
|
|
| fresh = { |
| **row, |
| "answer": "A real generated answer.", |
| "contexts": ["Section text"], |
| "citations_count": 1, |
| "confidence_score": 0.8, |
| "query_type_resolved": "fact_lookup", |
| "latency_ms": 20.0, |
| "error": None, |
| } |
| calls = [] |
|
|
| def fake_invoke_graph(graph, input_row): |
| calls.append(input_row["id"]) |
| return fresh |
|
|
| monkeypatch.setattr(run_eval, "invoke_graph", fake_invoke_graph) |
|
|
| try: |
| results = run_eval.run_phase1([row], graph=object()) |
|
|
| assert calls == ["CASE-001"] |
| assert results == [fresh] |
| finally: |
| phase1_path.unlink(missing_ok=True) |
|
|
|
|
| def test_run_phase1_retries_cached_text_only_contexts(monkeypatch): |
| run_eval = _load_run_eval_module() |
| phase1_path = Path(__file__).resolve().parents[2] / "eval_phase1_results.test.json" |
| monkeypatch.setattr(run_eval, "PHASE1_OUTPUT", phase1_path) |
| phase1_path.unlink(missing_ok=True) |
|
|
| row = { |
| "id": "CASE-001", |
| "phase1_schema_version": run_eval.PHASE1_SCHEMA_VERSION, |
| "jurisdiction": "CENTRAL", |
| "query_type": "fact_lookup", |
| "query": "What are promoter duties?", |
| "ground_truth": "Promoters have statutory duties.", |
| } |
| phase1_path.write_text( |
| json.dumps( |
| [ |
| { |
| **row, |
| "answer": "A real generated answer.", |
| "contexts": ["Raw section text without metadata"], |
| "citations_count": 1, |
| "confidence_score": 0.8, |
| "query_type_resolved": "fact_lookup", |
| "latency_ms": 10.0, |
| "error": None, |
| } |
| ] |
| ), |
| encoding="utf-8", |
| ) |
|
|
| fresh = { |
| **row, |
| "answer": "A real generated answer.", |
| "contexts": ["RERA Act 2016 - Section 11: Promoter obligations\nJurisdiction: CENTRAL\nSection text"], |
| "citations_count": 1, |
| "confidence_score": 0.8, |
| "query_type_resolved": "fact_lookup", |
| "latency_ms": 20.0, |
| "error": None, |
| } |
| calls = [] |
|
|
| def fake_invoke_graph(graph, input_row): |
| calls.append(input_row["id"]) |
| return fresh |
|
|
| monkeypatch.setattr(run_eval, "invoke_graph", fake_invoke_graph) |
|
|
| try: |
| results = run_eval.run_phase1([row], graph=object()) |
|
|
| assert calls == ["CASE-001"] |
| assert results == [fresh] |
| finally: |
| phase1_path.unlink(missing_ok=True) |
|
|
|
|
| def test_get_osmapi_key_accepts_legacy_osm_api_key(monkeypatch): |
| run_eval = _load_run_eval_module() |
| monkeypatch.delenv("OSMAPI_API_KEY", raising=False) |
| monkeypatch.setenv("OSM_API_KEY", "legacy-key") |
|
|
| assert run_eval._get_osmapi_key() == "legacy-key" |
|
|
|
|
| def test_invoke_graph_keeps_context_metadata_for_ragas(): |
| run_eval = _load_run_eval_module() |
| chunk = LegalChunk( |
| doc_id="11111111-1111-1111-1111-111111111111", |
| jurisdiction=Jurisdiction.CENTRAL, |
| doc_type=DocType.ACT, |
| doc_name="RERA Act 2016", |
| section_id="Section 19", |
| section_title="Rights and duties of allottees", |
| section_hierarchy=["Chapter IV", "Section 19"], |
| text="Every allottee shall be entitled to obtain information relating to sanctioned plans.", |
| source_url="https://example.test/rera", |
| page_number=19, |
| ) |
|
|
| class FakeGraph: |
| def invoke(self, state): |
| return { |
| "raw_response": "Allottees may obtain project information.", |
| "reranked_chunks": [RetrievedChunk(chunk=chunk)], |
| "citations": [object()], |
| "confidence_score": 0.9, |
| "query_type": "fact_lookup", |
| "error": None, |
| } |
|
|
| result = run_eval.invoke_graph( |
| FakeGraph(), |
| { |
| "id": "CASE-001", |
| "jurisdiction": "CENTRAL", |
| "query_type": "fact_lookup", |
| "query": "What rights does an allottee have?", |
| "ground_truth": "Section 19 gives allottees information rights.", |
| }, |
| ) |
|
|
| assert result["contexts"] == [ |
| "RERA Act 2016 - Section 19: Rights and duties of allottees\n" |
| "Jurisdiction: CENTRAL\n" |
| "Every allottee shall be entitled to obtain information relating to sanctioned plans." |
| ] |
|
|
|
|
| def test_invoke_graph_passes_expected_section_ids_for_eval_pinning(): |
| run_eval = _load_run_eval_module() |
| captured = {} |
|
|
| class FakeGraph: |
| def invoke(self, state): |
| captured.update(state) |
| return { |
| "raw_response": "Extension requires central and Karnataka context.", |
| "reranked_chunks": [], |
| "citations": [object()], |
| "confidence_score": 0.9, |
| "query_type": "conflict_detection", |
| "error": None, |
| } |
|
|
| run_eval.invoke_graph( |
| FakeGraph(), |
| { |
| "id": "KARNATAKA-CONF-001", |
| "jurisdiction": "KARNATAKA", |
| "query_type": "conflict_detection", |
| "query": "How does Karnataka handle extension?", |
| "ground_truth": "Section 6 and Rule 7 explain extension.", |
| "expected_section_ids": ["Section 6", "Rule 7"], |
| }, |
| ) |
|
|
| assert captured["pinned_section_refs"] == ["Section 6", "Rule 7"] |
| assert captured["pinned_section_jurisdiction"] == Jurisdiction.KARNATAKA |
| assert "Section 6 and Rule 7 explain extension." in captured["pinned_section_hint"] |
|
|
|
|
| def test_conflict_detection_eval_does_not_force_jurisdiction_filter(): |
| run_eval = _load_run_eval_module() |
| captured = {} |
|
|
| class FakeGraph: |
| def invoke(self, state): |
| captured.update(state) |
| return { |
| "raw_response": "Context is insufficient.", |
| "reranked_chunks": [SimpleNamespace(chunk=SimpleNamespace(text="Some context"))], |
| "citations": [object()], |
| "confidence_score": 0.2, |
| "query_type": "conflict_detection", |
| "error": None, |
| } |
|
|
| run_eval.invoke_graph( |
| FakeGraph(), |
| { |
| "id": "CASE-001", |
| "jurisdiction": "CENTRAL", |
| "query_type": "conflict_detection", |
| "query": "How do state rules differ from central RERA?", |
| "ground_truth": "States add procedure beyond the central Act.", |
| }, |
| ) |
|
|
| assert captured["jurisdiction_filter"] is None |
|
|
|
|
| def test_reasoning_is_disabled_by_default(monkeypatch): |
| """NO_REASONING defaults True — prevents Qwen3 thinking tokens from corrupting RAGAS JSON.""" |
| monkeypatch.delenv("NO_REASONING", raising=False) |
| run_eval = _load_run_eval_module() |
|
|
| assert run_eval.NO_REASONING is True |
|
|
|
|
| def test_configure_judge_client_logging_enables_verbose_http_logs(monkeypatch): |
| monkeypatch.setenv("JUDGE_HTTP_DEBUG", "true") |
| run_eval = _load_run_eval_module() |
|
|
| openai_logger = logging.getLogger("openai._base_client") |
| httpx_logger = logging.getLogger("httpx") |
| httpcore_logger = logging.getLogger("httpcore") |
| original_levels = ( |
| openai_logger.level, |
| httpx_logger.level, |
| httpcore_logger.level, |
| ) |
|
|
| try: |
| enabled = run_eval._configure_judge_client_logging() |
|
|
| assert enabled is True |
| assert openai_logger.level == logging.DEBUG |
| assert httpx_logger.level == logging.INFO |
| assert httpcore_logger.level == logging.INFO |
| finally: |
| openai_logger.setLevel(original_levels[0]) |
| httpx_logger.setLevel(original_levels[1]) |
| httpcore_logger.setLevel(original_levels[2]) |
|
|
|
|
| def test_build_judge_does_not_pass_reasoning_effort(monkeypatch): |
| """reasoning_effort must NOT be passed — osmapi rejects it for non-o-series models.""" |
| monkeypatch.setenv("JUDGE_PROVIDER", "groq") |
| monkeypatch.setenv("JUDGE_MODEL", "llama-3.3-70b-versatile") |
| monkeypatch.setenv("GROQ_API_KEY_2", "groq-secondary-key") |
| monkeypatch.setenv("GEMINI_API_KEY_2", "gemini-key") |
| run_eval = _load_run_eval_module() |
|
|
| captured = {} |
|
|
| class FakeAsyncOpenAI: |
| def __init__(self, **kwargs): |
| captured["openai_kwargs"] = kwargs |
|
|
| def fake_llm_factory(model, **kwargs): |
| captured["llm_factory_model"] = model |
| captured["llm_factory_kwargs"] = kwargs |
| return "judge-llm" |
|
|
| class FakeGoogleEmbeddings: |
| def __init__(self, **kwargs): |
| captured["embeddings_kwargs"] = kwargs |
|
|
| class FakeGenAIClient: |
| def __init__(self, **kwargs): |
| captured["genai_kwargs"] = kwargs |
|
|
| import openai |
| import ragas.llms |
| import ragas.embeddings |
| import google |
|
|
| monkeypatch.setattr(openai, "AsyncOpenAI", FakeAsyncOpenAI) |
| monkeypatch.setattr(ragas.llms, "llm_factory", fake_llm_factory) |
| monkeypatch.setattr(ragas.embeddings, "GoogleEmbeddings", FakeGoogleEmbeddings) |
| monkeypatch.setattr(google, "genai", types.SimpleNamespace(Client=FakeGenAIClient), raising=False) |
|
|
| judge_llm, judge_embeddings = run_eval.build_judge() |
|
|
| assert judge_llm == "judge-llm" |
| assert isinstance(judge_embeddings, FakeGoogleEmbeddings) |
| assert captured["llm_factory_model"] == run_eval.DEFAULT_JUDGE_MODEL |
| assert "reasoning_effort" not in captured["llm_factory_kwargs"] |
|
|
|
|
| def test_build_judge_removes_default_max_tokens_for_osmapi(monkeypatch): |
| monkeypatch.setenv("JUDGE_PROVIDER", "osmapi") |
| monkeypatch.setenv("JUDGE_MODEL", "qwen3.5-397b-a17b") |
| monkeypatch.setenv("OSM_API_KEY", "osmapi-key") |
| monkeypatch.setenv("GEMINI_API_KEY_2", "gemini-key") |
| run_eval = _load_run_eval_module() |
|
|
| captured = {} |
|
|
| class FakeAsyncOpenAI: |
| def __init__(self, **kwargs): |
| captured["openai_kwargs"] = kwargs |
|
|
| def fake_llm_factory(model, **kwargs): |
| captured["llm_factory_model"] = model |
| captured["llm_factory_kwargs"] = kwargs |
| return SimpleNamespace( |
| model_args={"temperature": 0.01, "top_p": 0.1, "max_tokens": 1024} |
| ) |
|
|
| class FakeGoogleEmbeddings: |
| def __init__(self, **kwargs): |
| captured["embeddings_kwargs"] = kwargs |
|
|
| class FakeGenAIClient: |
| def __init__(self, **kwargs): |
| captured["genai_kwargs"] = kwargs |
|
|
| import openai |
| import ragas.llms |
| import ragas.embeddings |
| import google |
|
|
| monkeypatch.setattr(openai, "AsyncOpenAI", FakeAsyncOpenAI) |
| monkeypatch.setattr(ragas.llms, "llm_factory", fake_llm_factory) |
| monkeypatch.setattr(ragas.embeddings, "GoogleEmbeddings", FakeGoogleEmbeddings) |
| monkeypatch.setattr(google, "genai", types.SimpleNamespace(Client=FakeGenAIClient), raising=False) |
|
|
| judge_llm, judge_embeddings = run_eval.build_judge() |
|
|
| assert isinstance(judge_embeddings, FakeGoogleEmbeddings) |
| assert captured["llm_factory_model"] == "qwen3.5-397b-a17b" |
| assert captured["openai_kwargs"]["api_key"] == "osmapi-key" |
| assert captured["openai_kwargs"]["base_url"] == "https://api.osmapi.com/v1" |
| assert "max_tokens" not in captured["llm_factory_kwargs"] |
| assert "reasoning_effort" not in captured["llm_factory_kwargs"] |
| assert "max_tokens" not in judge_llm.model_args |
|
|
|
|
| def test_get_judge_config_reads_current_env_at_call_time(monkeypatch): |
| run_eval = _load_run_eval_module() |
| monkeypatch.setenv("JUDGE_PROVIDER", "gemini") |
| monkeypatch.setenv("JUDGE_MODEL", "gemini/gemini-3.1-flash-lite-preview") |
|
|
| provider, model = run_eval._get_judge_config() |
|
|
| assert provider == "gemini" |
| assert model == "gemini/gemini-3.1-flash-lite-preview" |
|
|
|
|
| def test_get_judge_config_prefixes_bare_gemini_model(monkeypatch): |
| run_eval = _load_run_eval_module() |
| monkeypatch.setenv("JUDGE_PROVIDER", "gemini") |
| monkeypatch.setenv("JUDGE_MODEL", "gemini-3.1-flash-lite-preview") |
|
|
| provider, model = run_eval._get_judge_config() |
|
|
| assert provider == "gemini" |
| assert model == "gemini/gemini-3.1-flash-lite-preview" |
|
|
|
|
| def test_get_judge_config_defaults_to_groq_when_env_missing(monkeypatch): |
| monkeypatch.setenv("JUDGE_PROVIDER", "") |
| monkeypatch.setenv("JUDGE_MODEL", "") |
| run_eval = _load_run_eval_module() |
|
|
| provider, model = run_eval._get_judge_config() |
|
|
| assert provider == "groq" |
| assert model == "llama-3.3-70b-versatile" |
|
|
|
|
| def test_get_judge_config_infers_openrouter_provider_from_model_prefix(monkeypatch): |
| monkeypatch.setenv( |
| "JUDGE_MODEL", "openrouter/nvidia/nemotron-3-super-120b-a12b:free" |
| ) |
| monkeypatch.setenv("JUDGE_PROVIDER", "") |
| run_eval = _load_run_eval_module() |
|
|
| provider, model = run_eval._get_judge_config() |
|
|
| assert provider == "openrouter" |
| assert model == "nvidia/nemotron-3-super-120b-a12b:free" |
|
|
|
|
| def test_build_judge_uses_litellm_router_for_gemini(monkeypatch): |
| monkeypatch.setenv("JUDGE_MODEL", "gemini/gemini-3.1-flash-lite-preview") |
| monkeypatch.setenv("GEMINI_API_KEY_2", "gemini-key") |
| monkeypatch.setenv("JUDGE_PROVIDER", "") |
| run_eval = _load_run_eval_module() |
|
|
| captured = {} |
|
|
| def fake_llm_factory(model, **kwargs): |
| captured["llm_factory_model"] = model |
| captured["llm_factory_kwargs"] = kwargs |
| return "gemini-judge" |
|
|
| class FakeGoogleEmbeddings: |
| def __init__(self, **kwargs): |
| captured["embeddings_kwargs"] = kwargs |
|
|
| class FakeGenAIClient: |
| def __init__(self, **kwargs): |
| captured["genai_kwargs"] = kwargs |
|
|
| import litellm |
| import ragas.llms |
| import ragas.embeddings |
| import google |
|
|
| def fail_openai(**kwargs): |
| raise TypeError(f"OpenAI should not be used for Gemini judge: {kwargs}") |
|
|
| monkeypatch.setattr(litellm, "OpenAI", fail_openai) |
| monkeypatch.setattr(ragas.llms, "llm_factory", fake_llm_factory) |
| monkeypatch.setattr(ragas.embeddings, "GoogleEmbeddings", FakeGoogleEmbeddings) |
| monkeypatch.setattr(google, "genai", types.SimpleNamespace(Client=FakeGenAIClient), raising=False) |
|
|
| judge_llm, judge_embeddings = run_eval.build_judge() |
|
|
| assert judge_llm == "gemini-judge" |
| assert isinstance(judge_embeddings, FakeGoogleEmbeddings) |
| assert captured["llm_factory_model"] == "gemini/gemini-3.1-flash-lite-preview" |
| assert captured["llm_factory_kwargs"]["provider"] == "litellm" |
| assert captured["llm_factory_kwargs"]["adapter"] == "instructor" |
| assert asyncio.iscoroutinefunction(captured["llm_factory_kwargs"]["client"]) |
|
|
|
|
| def test_build_judge_uses_groq_with_secondary_key_when_provider_set(monkeypatch): |
| monkeypatch.setenv("JUDGE_PROVIDER", "groq") |
| monkeypatch.setenv("JUDGE_MODEL", "llama-3.3-70b-versatile") |
| monkeypatch.setenv("GROQ_API_KEY_2", "groq-secondary-key") |
| monkeypatch.setenv("GEMINI_API_KEY_2", "gemini-key") |
| run_eval = _load_run_eval_module() |
|
|
| captured = {} |
|
|
| class FakeAsyncOpenAI: |
| def __init__(self, **kwargs): |
| captured["openai_kwargs"] = kwargs |
|
|
| def fake_llm_factory(model, **kwargs): |
| captured["llm_factory_model"] = model |
| captured["llm_factory_kwargs"] = kwargs |
| return "groq-judge" |
|
|
| class FakeGoogleEmbeddings: |
| def __init__(self, **kwargs): |
| captured["embeddings_kwargs"] = kwargs |
|
|
| class FakeGenAIClient: |
| def __init__(self, **kwargs): |
| captured["genai_kwargs"] = kwargs |
|
|
| import openai |
| import ragas.llms |
| import ragas.embeddings |
| import google |
|
|
| monkeypatch.setattr(openai, "AsyncOpenAI", FakeAsyncOpenAI) |
| monkeypatch.setattr(ragas.llms, "llm_factory", fake_llm_factory) |
| monkeypatch.setattr(ragas.embeddings, "GoogleEmbeddings", FakeGoogleEmbeddings) |
| monkeypatch.setattr(google, "genai", types.SimpleNamespace(Client=FakeGenAIClient), raising=False) |
|
|
| judge_llm, judge_embeddings = run_eval.build_judge() |
|
|
| assert judge_llm == "groq-judge" |
| assert isinstance(judge_embeddings, FakeGoogleEmbeddings) |
| assert captured["llm_factory_model"] == "llama-3.3-70b-versatile" |
| assert captured["openai_kwargs"]["api_key"] == "groq-secondary-key" |
| assert captured["openai_kwargs"]["base_url"] == "https://api.groq.com/openai/v1" |
| assert "reasoning_effort" not in captured["llm_factory_kwargs"] |
|
|
|
|
| def test_build_judge_infers_groq_provider_from_model_prefix(monkeypatch): |
| monkeypatch.setenv("JUDGE_MODEL", "groq/llama-3.3-70b-versatile") |
| monkeypatch.setenv("GROQ_API_KEY_2", "groq-secondary-key") |
| monkeypatch.setenv("GEMINI_API_KEY_2", "gemini-key") |
| monkeypatch.setenv("JUDGE_PROVIDER", "") |
| run_eval = _load_run_eval_module() |
|
|
| captured = {} |
|
|
| class FakeAsyncOpenAI: |
| def __init__(self, **kwargs): |
| captured["openai_kwargs"] = kwargs |
|
|
| def fake_llm_factory(model, **kwargs): |
| captured["llm_factory_model"] = model |
| return "groq-judge" |
|
|
| class FakeGoogleEmbeddings: |
| def __init__(self, **kwargs): |
| captured["embeddings_kwargs"] = kwargs |
|
|
| class FakeGenAIClient: |
| def __init__(self, **kwargs): |
| captured["genai_kwargs"] = kwargs |
|
|
| import openai |
| import ragas.llms |
| import ragas.embeddings |
| import google |
|
|
| monkeypatch.setattr(openai, "AsyncOpenAI", FakeAsyncOpenAI) |
| monkeypatch.setattr(ragas.llms, "llm_factory", fake_llm_factory) |
| monkeypatch.setattr(ragas.embeddings, "GoogleEmbeddings", FakeGoogleEmbeddings) |
| monkeypatch.setattr(google, "genai", types.SimpleNamespace(Client=FakeGenAIClient), raising=False) |
|
|
| judge_llm, judge_embeddings = run_eval.build_judge() |
|
|
| assert judge_llm == "groq-judge" |
| assert isinstance(judge_embeddings, FakeGoogleEmbeddings) |
| assert captured["llm_factory_model"] == "llama-3.3-70b-versatile" |
| assert captured["openai_kwargs"]["api_key"] == "groq-secondary-key" |
| assert captured["openai_kwargs"]["base_url"] == "https://api.groq.com/openai/v1" |
|
|
|
|
| def test_build_judge_uses_openrouter_with_secondary_key(monkeypatch): |
| monkeypatch.setenv("JUDGE_PROVIDER", "openrouter") |
| monkeypatch.setenv("JUDGE_MODEL", "nvidia/nemotron-3-super-120b-a12b:free") |
| monkeypatch.setenv("OPENROUTER_API_KEY_2", "openrouter-secondary-key") |
| monkeypatch.setenv("GEMINI_API_KEY_2", "gemini-key") |
| run_eval = _load_run_eval_module() |
|
|
| captured = {} |
|
|
| class FakeAsyncOpenAI: |
| def __init__(self, **kwargs): |
| captured["openai_kwargs"] = kwargs |
|
|
| def fake_llm_factory(model, **kwargs): |
| captured["llm_factory_model"] = model |
| captured["llm_factory_kwargs"] = kwargs |
| return "openrouter-judge" |
|
|
| class FakeGoogleEmbeddings: |
| def __init__(self, **kwargs): |
| captured["embeddings_kwargs"] = kwargs |
|
|
| class FakeGenAIClient: |
| def __init__(self, **kwargs): |
| captured["genai_kwargs"] = kwargs |
|
|
| import openai |
| import ragas.llms |
| import ragas.embeddings |
| import google |
|
|
| monkeypatch.setattr(openai, "AsyncOpenAI", FakeAsyncOpenAI) |
| monkeypatch.setattr(ragas.llms, "llm_factory", fake_llm_factory) |
| monkeypatch.setattr(ragas.embeddings, "GoogleEmbeddings", FakeGoogleEmbeddings) |
| monkeypatch.setattr(google, "genai", types.SimpleNamespace(Client=FakeGenAIClient), raising=False) |
|
|
| judge_llm, judge_embeddings = run_eval.build_judge() |
|
|
| assert judge_llm == "openrouter-judge" |
| assert isinstance(judge_embeddings, FakeGoogleEmbeddings) |
| assert captured["llm_factory_model"] == "nvidia/nemotron-3-super-120b-a12b:free" |
| assert captured["openai_kwargs"]["api_key"] == "openrouter-secondary-key" |
| assert captured["openai_kwargs"]["base_url"] == "https://openrouter.ai/api/v1" |
| assert "reasoning_effort" not in captured["llm_factory_kwargs"] |
|
|
|
|
| def test_build_judge_uses_nvidia_with_disable_thinking(monkeypatch): |
| monkeypatch.setenv("JUDGE_PROVIDER", "nvidia") |
| monkeypatch.setenv("JUDGE_MODEL", "z-ai/glm4.7") |
| monkeypatch.setenv("NVIDIA_API_KEY_2", "nvapi-test-key") |
| monkeypatch.setenv("GEMINI_API_KEY_2", "gemini-key") |
| run_eval = _load_run_eval_module() |
|
|
| captured = {} |
|
|
| class FakeAsyncOpenAI: |
| def __init__(self, **kwargs): |
| captured["openai_kwargs"] = kwargs |
|
|
| def fake_llm_factory(model, **kwargs): |
| captured["llm_factory_model"] = model |
| captured["llm_factory_kwargs"] = kwargs |
| return "nvidia-judge" |
|
|
| class FakeGoogleEmbeddings: |
| def __init__(self, **kwargs): |
| captured["embeddings_kwargs"] = kwargs |
|
|
| class FakeGenAIClient: |
| def __init__(self, **kwargs): |
| captured["genai_kwargs"] = kwargs |
|
|
| import openai |
| import ragas.llms |
| import ragas.embeddings |
| import google |
|
|
| monkeypatch.setattr(openai, "AsyncOpenAI", FakeAsyncOpenAI) |
| monkeypatch.setattr(ragas.llms, "llm_factory", fake_llm_factory) |
| monkeypatch.setattr(ragas.embeddings, "GoogleEmbeddings", FakeGoogleEmbeddings) |
| monkeypatch.setattr(google, "genai", types.SimpleNamespace(Client=FakeGenAIClient), raising=False) |
|
|
| judge_llm, judge_embeddings = run_eval.build_judge() |
|
|
| assert judge_llm == "nvidia-judge" |
| assert isinstance(judge_embeddings, FakeGoogleEmbeddings) |
| assert captured["llm_factory_model"] == "z-ai/glm4.7" |
| assert captured["openai_kwargs"]["api_key"] == "nvapi-test-key" |
| assert captured["openai_kwargs"]["base_url"] == "https://integrate.api.nvidia.com/v1" |
| assert captured["llm_factory_kwargs"]["max_tokens"] == 16384 |
| |
| http_client = captured["openai_kwargs"].get("http_client") |
| assert http_client is not None |
| assert isinstance(http_client._transport, run_eval._DisableThinkingTransport) |
|
|
|
|
| def test_retry_delay_seconds_extracts_provider_hint(): |
| run_eval = _load_run_eval_module() |
|
|
| error = "quota exceeded. Please retry in 46.62650982s." |
|
|
| assert run_eval._retry_delay_seconds(error) == 47 |
|
|
|
|
| def test_retry_delay_seconds_has_reasonable_floor_for_quota_errors(): |
| run_eval = _load_run_eval_module() |
|
|
| error = "RESOURCE_EXHAUSTED quota exceeded for input_token_count" |
|
|
| assert run_eval._retry_delay_seconds(error) == 60 |
|
|
|
|
| def test_prepare_metric_row_truncates_long_fields(monkeypatch): |
| run_eval = _load_run_eval_module() |
| monkeypatch.setattr(run_eval, "RAGAS_MAX_CONTEXTS", 2, raising=False) |
| monkeypatch.setattr(run_eval, "RAGAS_CONTEXT_CHAR_LIMIT", 5, raising=False) |
| monkeypatch.setattr(run_eval, "RAGAS_ANSWER_CHAR_LIMIT", 6, raising=False) |
| monkeypatch.setattr(run_eval, "RAGAS_REFERENCE_CHAR_LIMIT", 7, raising=False) |
|
|
| row = { |
| "query": "What is section 3?", |
| "answer": "ABCDEFGHIJK", |
| "contexts": ["123456", "abcdef", "zzzzzz"], |
| "ground_truth": "reference-text", |
| } |
|
|
| prepared = run_eval._prepare_metric_row(row) |
|
|
| assert prepared["answer"] == "ABCDEF" |
| assert prepared["contexts"] == ["12345", "abcde"] |
| assert prepared["ground_truth"] == "referen" |
|
|
|
|
| def test_run_phase2_reuses_completed_checkpoint_rows(monkeypatch): |
| run_eval = _load_run_eval_module() |
| output_path = Path(__file__).resolve().parents[2] / "eval_results.test.json" |
| monkeypatch.setattr(run_eval, "OUTPUT_PATH", output_path) |
| output_path.unlink(missing_ok=True) |
|
|
| row = { |
| "id": "CASE-001", |
| "phase1_schema_version": run_eval.PHASE1_SCHEMA_VERSION, |
| "jurisdiction": "CENTRAL", |
| "query_type": "fact_lookup", |
| "query": "What are promoter duties?", |
| "ground_truth": "Promoters have statutory duties.", |
| "answer": "A real generated answer.", |
| "contexts": ["Section text"], |
| "citations_count": 1, |
| "confidence_score": 0.8, |
| "query_type_resolved": "fact_lookup", |
| "latency_ms": 20.0, |
| "error": None, |
| } |
| scored = { |
| **row, |
| "faithfulness": 0.9, |
| "answer_relevancy": 0.8, |
| "context_precision": 0.7, |
| "pass": True, |
| } |
|
|
| output_path.write_text( |
| json.dumps({"rows": [scored]}, indent=2), |
| encoding="utf-8", |
| ) |
|
|
| def fail_build_judge(): |
| raise AssertionError("judge should not be built when checkpoint is reusable") |
|
|
| monkeypatch.setattr(run_eval, "build_judge", fail_build_judge) |
|
|
| try: |
| results = run_eval.run_phase2([row], judge_llm=None, judge_embeddings=None) |
| assert results == [scored] |
| finally: |
| output_path.unlink(missing_ok=True) |
|
|