Step 9: evaluator — per-criterion verdict with threshold safety rules
Browse files
Implements specs/09_evaluator.md. The combined confidence formula weighs LLM
confidence against OCR tier quality. Safety rules prevent silent disqualification:
not_eligible at medium confidence (0.55-0.80) is downgraded to needs_review.
Falls back to precomputed on LLMUnavailable.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- core/evaluator.py +152 -3
- specs/09_evaluator.md +134 -0
core/evaluator.py
CHANGED
|
@@ -1,9 +1,158 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
|
| 4 |
def evaluate(bidder_id: str, criterion: Criterion) -> Verdict:
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
def evaluate_bidder(bidder_id: str, criteria: list[Criterion]) -> list[Verdict]:
|
| 9 |
-
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from datetime import datetime, timezone
|
| 3 |
+
|
| 4 |
+
import streamlit as st
|
| 5 |
+
|
| 6 |
+
from core import audit, bidder_processor, fallback
|
| 7 |
+
from core.config import CONFIDENCE_HIGH, CONFIDENCE_REVIEW, MODEL_VERSION
|
| 8 |
+
from core.llm_client import LLM, LLMUnavailable
|
| 9 |
+
from core.prompts import EVALUATE_CRITERION_PROMPT_SYSTEM
|
| 10 |
+
from core.schemas import Criterion, Source, Verdict
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (timezone-aware)."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@st.cache_resource
def _get_llm() -> LLM:
    """Build the LLM client; `st.cache_resource` caches the returned instance."""
    client = LLM()
    return client
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _combined_confidence(
    llm_confidence: float, source_type: str, ocr_confidence: float | None
) -> float:
    """Blend the LLM's confidence with the quality of the evidence source.

    Args:
        llm_confidence: Confidence reported by the LLM for its verdict.
        source_type: How the evidence text was obtained. ``"text_pdf"``,
            ``"vision_llm"`` and ``"tesseract"`` are recognised; any other
            value is treated like ``"text_pdf"``.
        ocr_confidence: OCR confidence of the evidence chunk, or ``None`` /
            a negative number when it is unknown. Only used for
            ``"tesseract"`` sources.

    Returns:
        The combined confidence score.
    """
    if source_type == "vision_llm":
        # Vision extraction is weighted with a fixed quality of 0.95.
        return 0.7 * llm_confidence + 0.3 * 0.95

    if source_type == "tesseract":
        # Compare against None explicitly: a genuine OCR confidence of 0.0 is
        # falsy and must not be mistaken for "unknown" (which would bump it
        # to the 0.3 placeholder and overstate the combined score).
        known = ocr_confidence is not None and ocr_confidence >= 0
        tesseract_conf = ocr_confidence if known else 0.3
        return 0.6 * llm_confidence + 0.4 * tesseract_conf

    # "text_pdf" and unrecognised source types: no OCR penalty is applied.
    return llm_confidence
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _apply_thresholds(verdict: str, combined: float) -> str:
    """Apply the confidence safety rules to an LLM verdict.

    Rules, in order:
      1. ``needs_review`` is always kept.
      2. At or above ``CONFIDENCE_HIGH`` the verdict is kept.
      3. Between ``CONFIDENCE_REVIEW`` (inclusive) and ``CONFIDENCE_HIGH``
         (exclusive) a ``not_eligible`` verdict becomes ``needs_review``.
      4. Below ``CONFIDENCE_REVIEW`` every verdict becomes ``needs_review``.

    Anything not caught by these rules is returned unchanged.
    """
    if verdict == "needs_review" or combined >= CONFIDENCE_HIGH:
        return verdict

    too_low = combined < CONFIDENCE_REVIEW
    medium = CONFIDENCE_REVIEW <= combined < CONFIDENCE_HIGH
    if too_low or (medium and verdict == "not_eligible"):
        return "needs_review"

    return verdict
|
| 45 |
|
| 46 |
|
| 47 |
def _coerce_confidence(raw: object, default: float = 0.5) -> float:
    """Parse an LLM-reported confidence into a float within [0.0, 1.0].

    Missing, non-numeric, or NaN values yield ``default``; numeric values
    outside the range are clamped.
    """
    try:
        value = float(raw)  # type: ignore[arg-type]
    except (TypeError, ValueError):
        return default
    if value != value:  # NaN is the only float that is not equal to itself
        return default
    return min(1.0, max(0.0, value))


def _coerce_page(raw: object) -> int | None:
    """Parse an LLM-reported page number; return None when it is unusable."""
    try:
        return int(raw)  # type: ignore[arg-type]
    except (TypeError, ValueError):
        return None


def _match_evidence(evidence, doc_name, page):
    """Pick the evidence chunk cited by the LLM.

    Preference order: same document and page, then same document, then the
    first retrieved chunk. ``evidence`` must be non-empty.
    """
    same_doc = [chunk for chunk in evidence if chunk.doc_name == doc_name]
    if page is not None:
        for chunk in same_doc:
            if chunk.page == page:
                return chunk
    return same_doc[0] if same_doc else evidence[0]


def evaluate(bidder_id: str, criterion: Criterion) -> Verdict:
    """Evaluate one bidder against one criterion and return a Verdict.

    Steps:
      1. Gather evidence via ``bidder_processor.gather_evidence``. With no
         evidence, return a ``needs_review`` verdict with zero confidence.
      2. Ask the LLM for a JSON verdict over the evidence.
      3. If the LLM raises ``LLMUnavailable``, log it, flag
         ``fallback_active`` in the Streamlit session state, and return
         ``fallback.load_evaluation(bidder_id, criterion.id)``.
      4. Combine the LLM confidence with the cited evidence chunk's source
         quality and apply the threshold safety rules.

    Every verdict produced here is recorded with ``audit.log``.

    Args:
        bidder_id: Identifier of the bidder whose documents are evaluated.
        criterion: The criterion to evaluate against.

    Returns:
        The resulting ``Verdict``.
    """
    evidence = bidder_processor.gather_evidence(bidder_id, criterion)

    if not evidence:
        v = Verdict(
            bidder_id=bidder_id,
            criterion_id=criterion.id,
            verdict="needs_review",
            reason="No matching evidence found in submitted documents.",
            llm_confidence=0.0,
            combined_confidence=0.0,
            model_version=MODEL_VERSION,
            timestamp=_now_iso(),
        )
        audit.log("criterion_evaluated", bidder_id=bidder_id,
                  criterion_id=criterion.id, verdict="needs_review",
                  combined_confidence=0.0)
        return v

    evidence_dicts = [
        {
            "doc_name": e.doc_name,
            "page": e.page,
            "ocr_confidence": e.ocr_confidence,
            "source_type": e.source_type,
            "text": e.text[:1500],  # cap each chunk's text at 1500 characters
        }
        for e in evidence
    ]

    user_prompt = f"""CRITERION:
{criterion.model_dump_json(indent=2)}

RETRIEVED EVIDENCE (top-k chunks from bidder {bidder_id}):
{json.dumps(evidence_dicts, indent=2)}

Return JSON:
{{
"verdict": "eligible" | "not_eligible" | "needs_review",
"extracted_value": "<short string as found in evidence>",
"normalized_value": <number or null>,
"chosen_source": {{"doc_name": "...", "page": <int>, "snippet": "<= 200 chars", "source_type": "..."}},
"llm_confidence": <0.0 to 1.0>,
"reason": "<one or two sentences>"
}}

Rules:
- If evidence directly contains a value satisfying the rule, verdict=eligible with high llm_confidence.
- If evidence directly contradicts the rule, verdict=not_eligible.
- If no relevant evidence retrieved, verdict=needs_review, llm_confidence<=0.4.
- If the source is OCR with low confidence and the value is borderline, lean to needs_review.
"""

    try:
        llm = _get_llm()
        result = llm.chat_json(EVALUATE_CRITERION_PROMPT_SYSTEM, user_prompt)
    except LLMUnavailable:
        audit.log("precomputed_fallback_used", bidder_id=bidder_id,
                  criterion_id=criterion.id, reason="LLMUnavailable in evaluate")
        if "fallback_active" not in st.session_state:
            st.session_state["fallback_active"] = True
        return fallback.load_evaluation(bidder_id, criterion.id)

    # The LLM output is untrusted: tolerate a non-object reply and malformed
    # fields instead of crashing part-way through an evaluation.
    if not isinstance(result, dict):
        result = {}

    llm_verdict = result.get("verdict")
    if llm_verdict not in ("eligible", "not_eligible", "needs_review"):
        # Missing or unrecognised verdicts are routed to a human reviewer.
        llm_verdict = "needs_review"
    extracted_value = result.get("extracted_value")
    normalized_value = result.get("normalized_value")
    llm_confidence = _coerce_confidence(result.get("llm_confidence"))
    reason = result.get("reason") or ""

    chosen_src = result.get("chosen_source")
    if not isinstance(chosen_src, dict):
        chosen_src = {}

    cited_page = _coerce_page(chosen_src.get("page"))
    best_evidence = _match_evidence(
        evidence, chosen_src.get("doc_name"), cited_page
    )

    # Take the source type from the evidence chunk itself; the LLM's
    # self-reported value is only a fallback. Trusting the LLM (or defaulting
    # to "text_pdf") would let a mislabelled OCR chunk skip its penalty.
    source_type = (
        best_evidence.source_type
        or chosen_src.get("source_type")
        or "text_pdf"
    )

    ocr_confidence = best_evidence.ocr_confidence
    if ocr_confidence is not None and ocr_confidence < 0:
        # Negative values are treated as "unknown".
        ocr_confidence = None

    source = None
    if chosen_src:
        source = Source(
            doc_name=chosen_src.get("doc_name") or "",
            page=cited_page if cited_page is not None else 1,
            snippet=str(chosen_src.get("snippet") or "")[:200],
            source_type=source_type,
        )

    combined = _combined_confidence(llm_confidence, source_type, ocr_confidence)
    final_verdict = _apply_thresholds(llm_verdict, combined)

    v = Verdict(
        bidder_id=bidder_id,
        criterion_id=criterion.id,
        verdict=final_verdict,
        extracted_value=extracted_value,
        normalized_value=normalized_value,
        source=source,
        llm_confidence=llm_confidence,
        ocr_confidence=ocr_confidence,
        combined_confidence=round(combined, 4),
        reason=reason,
        model_version=MODEL_VERSION,
        timestamp=_now_iso(),
        review_status="pending",
    )
    audit.log("criterion_evaluated", bidder_id=bidder_id,
              criterion_id=criterion.id, verdict=final_verdict,
              combined_confidence=round(combined, 4))
    return v
|
| 155 |
|
| 156 |
|
| 157 |
def evaluate_bidder(bidder_id: str, criteria: list[Criterion]) -> list[Verdict]:
    """Evaluate a bidder against each criterion in order; return the verdicts."""
    verdicts: list[Verdict] = []
    for criterion in criteria:
        verdicts.append(evaluate(bidder_id, criterion))
    return verdicts
|
specs/09_evaluator.md
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spec 09 — Evaluator
|
| 2 |
+
|
| 3 |
+
**Step:** 9 of 15
|
| 4 |
+
**Time budget:** ~25 min
|
| 5 |
+
**Checkpoint:** `evaluate("bidder_a", c1)` returns eligible with high confidence; `evaluate("bidder_b", c1)` returns not_eligible (or needs_review when the medium-confidence safety rule downgrades it).
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Goal
|
| 10 |
+
|
| 11 |
+
Implement `core/evaluator.py` — per-criterion verdict generation with combined confidence scoring and threshold-based safety rules.
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## `evaluate(bidder_id: str, criterion: Criterion) -> Verdict`
|
| 16 |
+
|
| 17 |
+
### Step 1 — Gather evidence
|
| 18 |
+
|
| 19 |
+
`evidence = bidder_processor.gather_evidence(bidder_id, criterion)`
|
| 20 |
+
|
| 21 |
+
If empty: return immediately:
|
| 22 |
+
```python
|
| 23 |
+
Verdict(
|
| 24 |
+
bidder_id=bidder_id,
|
| 25 |
+
criterion_id=criterion.id,
|
| 26 |
+
verdict="needs_review",
|
| 27 |
+
reason="No matching evidence found in submitted documents.",
|
| 28 |
+
llm_confidence=0.0,
|
| 29 |
+
combined_confidence=0.0,
|
| 30 |
+
model_version=MODEL_VERSION,
|
| 31 |
+
timestamp=now_iso(),
|
| 32 |
+
)
|
| 33 |
+
```
|
| 34 |
+
Log `criterion_evaluated` with verdict=needs_review.
|
| 35 |
+
|
| 36 |
+
### Step 2 — Build LLM prompt
|
| 37 |
+
|
| 38 |
+
User message template:
|
| 39 |
+
```
|
| 40 |
+
CRITERION:
|
| 41 |
+
{criterion.model_dump_json(indent=2)}
|
| 42 |
+
|
| 43 |
+
RETRIEVED EVIDENCE (top-k chunks from bidder {bidder_id}):
|
| 44 |
+
{json list of evidence dicts with doc_name, page, ocr_confidence, source_type, text}
|
| 45 |
+
|
| 46 |
+
Return JSON:
|
| 47 |
+
{
|
| 48 |
+
"verdict": "eligible" | "not_eligible" | "needs_review",
|
| 49 |
+
"extracted_value": "<short string as found in evidence>",
|
| 50 |
+
"normalized_value": <number or null>,
|
| 51 |
+
"chosen_source": {"doc_name": "...", "page": <int>, "snippet": "<= 200 chars", "source_type": "..."},
|
| 52 |
+
"llm_confidence": <0.0 to 1.0>,
|
| 53 |
+
"reason": "<one or two sentences>"
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
Rules:
|
| 57 |
+
- If evidence directly contains a value satisfying the rule, verdict=eligible with high llm_confidence.
|
| 58 |
+
- If evidence directly contradicts the rule, verdict=not_eligible.
|
| 59 |
+
- If no relevant evidence retrieved, verdict=needs_review, llm_confidence<=0.4.
|
| 60 |
+
- If the source is OCR with low confidence and the value is borderline, lean to needs_review.
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
### Step 3 — Call LLM
|
| 64 |
+
|
| 65 |
+
`result = llm.chat_json(EVALUATE_CRITERION_PROMPT_SYSTEM, user_prompt)`
|
| 66 |
+
|
| 67 |
+
On `LLMUnavailable`: return `fallback.load_evaluation(bidder_id, criterion.id)`.
|
| 68 |
+
|
| 69 |
+
### Step 4 — Parse result
|
| 70 |
+
|
| 71 |
+
Extract: `verdict`, `extracted_value`, `normalized_value`, `chosen_source`, `llm_confidence`, `reason`.
|
| 72 |
+
|
| 73 |
+
Build `Source` object from `chosen_source`.
|
| 74 |
+
|
| 75 |
+
### Step 5 — Combined confidence
|
| 76 |
+
|
| 77 |
+
Find the evidence chunk matching `chosen_source` to get its `ocr_confidence` and `source_type`, then compute the combined confidence:
|
| 78 |
+
|
| 79 |
+
```python
|
| 80 |
+
if source_type == "text_pdf":
|
| 81 |
+
combined = llm_confidence
|
| 82 |
+
elif source_type == "vision_llm":
|
| 83 |
+
combined = 0.7 * llm_confidence + 0.3 * 0.95
|
| 84 |
+
elif source_type == "tesseract":
|
| 85 |
+
tc = ocr_confidence if ocr_confidence and ocr_confidence >= 0 else 0.3
|
| 86 |
+
combined = 0.6 * llm_confidence + 0.4 * tc
|
| 87 |
+
else:
|
| 88 |
+
combined = llm_confidence
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
### Step 6 — Apply threshold safety rules (in order)
|
| 92 |
+
|
| 93 |
+
1. If LLM verdict is `needs_review` → keep.
|
| 94 |
+
2. If `combined >= CONFIDENCE_HIGH` → keep LLM verdict.
|
| 95 |
+
3. If `CONFIDENCE_REVIEW <= combined < CONFIDENCE_HIGH` AND verdict is `not_eligible` → downgrade to `needs_review` (NEVER silently disqualify at medium confidence).
|
| 96 |
+
4. If `combined < CONFIDENCE_REVIEW` → force `needs_review`.
|
| 97 |
+
|
| 98 |
+
### Step 7 — Build and return Verdict
|
| 99 |
+
|
| 100 |
+
```python
|
| 101 |
+
Verdict(
|
| 102 |
+
bidder_id=bidder_id,
|
| 103 |
+
criterion_id=criterion.id,
|
| 104 |
+
verdict=final_verdict,
|
| 105 |
+
extracted_value=extracted_value,
|
| 106 |
+
normalized_value=normalized_value,
|
| 107 |
+
source=source,
|
| 108 |
+
llm_confidence=llm_confidence,
|
| 109 |
+
ocr_confidence=ocr_confidence_from_best_evidence,
|
| 110 |
+
combined_confidence=combined,
|
| 111 |
+
reason=reason,
|
| 112 |
+
model_version=MODEL_VERSION,
|
| 113 |
+
timestamp=now_iso(),
|
| 114 |
+
review_status="pending",
|
| 115 |
+
)
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
Log `criterion_evaluated` to audit.
|
| 119 |
+
|
| 120 |
+
---
|
| 121 |
+
|
| 122 |
+
## `evaluate_bidder(bidder_id: str, criteria: list[Criterion]) -> list[Verdict]`
|
| 123 |
+
|
| 124 |
+
Calls `evaluate(bidder_id, c)` for each criterion in sequence. Returns list.
|
| 125 |
+
|
| 126 |
+
---
|
| 127 |
+
|
| 128 |
+
## Acceptance Criteria
|
| 129 |
+
|
| 130 |
+
1. `evaluate("bidder_a", c1)` → `verdict="eligible"`, `combined_confidence >= 0.8` (or fallback eligible).
|
| 131 |
+
2. `evaluate("bidder_b", c1)` → `verdict="not_eligible"` or `"needs_review"` (never silently eligible when turnover is below threshold).
|
| 132 |
+
3. `evaluate_bidder("bidder_a", criteria)` returns 5 verdicts.
|
| 133 |
+
4. All verdicts are `Verdict` instances with valid `review_status="pending"`.
|
| 134 |
+
5. Audit log gains `criterion_evaluated` entries.
|