JaydeepR Claude Sonnet 4.6 committed on
Commit
27760c8
·
1 Parent(s): a337229

Step 9: evaluator — per-criterion verdict with threshold safety rules

Browse files

Implements specs/09_evaluator.md. Combined confidence formula weighs LLM
confidence against OCR tier quality. Safety rules prevent silent disqualification:
not_eligible at medium confidence (0.55-0.80) is downgraded to needs_review.
Falls back to precomputed on LLMUnavailable.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. core/evaluator.py +152 -3
  2. specs/09_evaluator.md +134 -0
core/evaluator.py CHANGED
@@ -1,9 +1,158 @@
1
- from core.schemas import Criterion, Verdict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
4
  def evaluate(bidder_id: str, criterion: Criterion) -> Verdict:
5
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
 
8
  def evaluate_bidder(bidder_id: str, criteria: list[Criterion]) -> list[Verdict]:
9
- raise NotImplementedError
 
1
+ import json
2
+ from datetime import datetime, timezone
3
+
4
+ import streamlit as st
5
+
6
+ from core import audit, bidder_processor, fallback
7
+ from core.config import CONFIDENCE_HIGH, CONFIDENCE_REVIEW, MODEL_VERSION
8
+ from core.llm_client import LLM, LLMUnavailable
9
+ from core.prompts import EVALUATE_CRITERION_PROMPT_SYSTEM
10
+ from core.schemas import Criterion, Source, Verdict
11
+
12
+
13
def _now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string.

    The datetime is timezone-aware, so the rendered text carries an explicit
    ``+00:00`` offset.
    """
    current_utc = datetime.now(tz=timezone.utc)
    return current_utc.isoformat()
15
+
16
+
17
@st.cache_resource
def _get_llm() -> LLM:
    """Build the LLM client used by the evaluator.

    Decorated with ``st.cache_resource`` so the client is obtained through
    Streamlit's resource cache rather than constructed on every call.
    """
    client = LLM()
    return client
20
+
21
+
22
+ def _combined_confidence(
23
+ llm_confidence: float, source_type: str, ocr_confidence: float | None
24
+ ) -> float:
25
+ if source_type == "text_pdf":
26
+ return llm_confidence
27
+ elif source_type == "vision_llm":
28
+ return 0.7 * llm_confidence + 0.3 * 0.95
29
+ elif source_type == "tesseract":
30
+ tc = ocr_confidence if ocr_confidence and ocr_confidence >= 0 else 0.3
31
+ return 0.6 * llm_confidence + 0.4 * tc
32
+ return llm_confidence
33
+
34
+
35
def _apply_thresholds(verdict: str, combined: float) -> str:
    """Apply the confidence safety rules to the LLM's verdict.

    Rules, checked in order:
      1. ``needs_review`` is always kept.
      2. At or above ``CONFIDENCE_HIGH`` the verdict is kept.
      3. ``not_eligible`` in the band between ``CONFIDENCE_REVIEW`` and
         ``CONFIDENCE_HIGH`` becomes ``needs_review`` so a bidder is never
         disqualified at medium confidence.
      4. Below ``CONFIDENCE_REVIEW`` everything becomes ``needs_review``.
    Anything left over (e.g. ``eligible`` at medium confidence) is kept.
    """
    if verdict == "needs_review" or combined >= CONFIDENCE_HIGH:
        return verdict

    in_medium_band = CONFIDENCE_REVIEW <= combined < CONFIDENCE_HIGH
    is_disqualifying = verdict == "not_eligible"
    if (in_medium_band and is_disqualifying) or combined < CONFIDENCE_REVIEW:
        return "needs_review"

    return verdict
45
 
46
 
def evaluate(bidder_id: str, criterion: Criterion) -> Verdict:
    """Produce a verdict for one bidder against one criterion.

    Gathers evidence, asks the LLM for a structured judgement, blends the
    LLM's confidence with the quality of the cited evidence source, applies
    the threshold safety rules and writes a ``criterion_evaluated`` audit
    entry.

    Args:
        bidder_id: Identifier of the bidder being evaluated.
        criterion: The criterion to evaluate the bidder against.

    Returns:
        A ``Verdict``. When no evidence is found this is ``needs_review``
        with zero confidence. When the LLM raises ``LLMUnavailable`` the
        result of ``fallback.load_evaluation`` is returned instead.
    """
    evidence = bidder_processor.gather_evidence(bidder_id, criterion)

    if not evidence:
        v = Verdict(
            bidder_id=bidder_id,
            criterion_id=criterion.id,
            verdict="needs_review",
            reason="No matching evidence found in submitted documents.",
            llm_confidence=0.0,
            combined_confidence=0.0,
            model_version=MODEL_VERSION,
            timestamp=_now_iso(),
        )
        audit.log("criterion_evaluated", bidder_id=bidder_id,
                  criterion_id=criterion.id, verdict="needs_review",
                  combined_confidence=0.0)
        return v

    evidence_dicts = [
        {
            "doc_name": e.doc_name,
            "page": e.page,
            "ocr_confidence": e.ocr_confidence,
            "source_type": e.source_type,
            # Cap each chunk so the prompt stays bounded.
            "text": e.text[:1500],
        }
        for e in evidence
    ]

    user_prompt = f"""CRITERION:
{criterion.model_dump_json(indent=2)}

RETRIEVED EVIDENCE (top-k chunks from bidder {bidder_id}):
{json.dumps(evidence_dicts, indent=2)}

Return JSON:
{{
"verdict": "eligible" | "not_eligible" | "needs_review",
"extracted_value": "<short string as found in evidence>",
"normalized_value": <number or null>,
"chosen_source": {{"doc_name": "...", "page": <int>, "snippet": "<= 200 chars", "source_type": "..."}},
"llm_confidence": <0.0 to 1.0>,
"reason": "<one or two sentences>"
}}

Rules:
- If evidence directly contains a value satisfying the rule, verdict=eligible with high llm_confidence.
- If evidence directly contradicts the rule, verdict=not_eligible.
- If no relevant evidence retrieved, verdict=needs_review, llm_confidence<=0.4.
- If the source is OCR with low confidence and the value is borderline, lean to needs_review.
"""

    try:
        llm = _get_llm()
        result = llm.chat_json(EVALUATE_CRITERION_PROMPT_SYSTEM, user_prompt)
    except LLMUnavailable:
        audit.log("precomputed_fallback_used", bidder_id=bidder_id,
                  criterion_id=criterion.id, reason="LLMUnavailable in evaluate")
        if "fallback_active" not in st.session_state:
            st.session_state["fallback_active"] = True
        return fallback.load_evaluation(bidder_id, criterion.id)

    # The LLM output is untrusted: every field may be missing, null or of
    # the wrong type, so each one is normalised before use.
    if not isinstance(result, dict):
        result = {}

    llm_verdict = result.get("verdict", "needs_review")
    if llm_verdict not in _KNOWN_VERDICTS:
        # An unrecognised verdict must never slip past the safety rules.
        llm_verdict = "needs_review"
    extracted_value = result.get("extracted_value")
    normalized_value = result.get("normalized_value")
    llm_confidence = _coerce_confidence(result.get("llm_confidence"))
    reason = result.get("reason") or ""

    chosen_src = result.get("chosen_source")
    if not isinstance(chosen_src, dict):
        chosen_src = {}
    cited_page = _coerce_page(chosen_src.get("page"))

    best_evidence = _match_evidence(
        evidence, chosen_src.get("doc_name"), cited_page
    )

    # Take the source tier from the matched evidence chunk, not from what the
    # LLM echoed back: a missing or wrong LLM value would otherwise default
    # to "text_pdf", the most trusted tier, and inflate the combined score.
    source_type = (
        best_evidence.source_type
        or chosen_src.get("source_type")
        or "text_pdf"
    )
    ocr_confidence = best_evidence.ocr_confidence
    if ocr_confidence is not None and ocr_confidence < 0:
        # Negative values mean "no OCR confidence available".
        ocr_confidence = None

    source = (
        Source(
            doc_name=chosen_src.get("doc_name") or "",
            page=cited_page if cited_page is not None else 1,
            snippet=str(chosen_src.get("snippet") or "")[:200],
            source_type=source_type,
        )
        if chosen_src
        else None
    )

    combined = _combined_confidence(llm_confidence, source_type, ocr_confidence)
    final_verdict = _apply_thresholds(llm_verdict, combined)
    rounded_combined = round(combined, 4)

    v = Verdict(
        bidder_id=bidder_id,
        criterion_id=criterion.id,
        verdict=final_verdict,
        extracted_value=extracted_value,
        normalized_value=normalized_value,
        source=source,
        llm_confidence=llm_confidence,
        ocr_confidence=ocr_confidence,
        combined_confidence=rounded_combined,
        reason=reason,
        model_version=MODEL_VERSION,
        timestamp=_now_iso(),
        review_status="pending",
    )
    audit.log("criterion_evaluated", bidder_id=bidder_id,
              criterion_id=criterion.id, verdict=final_verdict,
              combined_confidence=rounded_combined)
    return v


# Verdict strings the prompt asks the LLM to choose from.
_KNOWN_VERDICTS = frozenset({"eligible", "not_eligible", "needs_review"})


def _coerce_confidence(raw: object, default: float = 0.5) -> float:
    """Parse an LLM-reported confidence into [0.0, 1.0], else *default*."""
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return default
    if value != value:  # NaN is the only float unequal to itself
        return default
    return min(1.0, max(0.0, value))


def _coerce_page(raw: object) -> int | None:
    """Parse an LLM-reported page number, or return None if unusable."""
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        return None


def _match_evidence(evidence: list, doc_name: object, page: int | None):
    """Pick the evidence chunk the LLM cited.

    Prefers a chunk with the same document name and page, then any chunk of
    the same document, then the first chunk. *evidence* must be non-empty.
    """
    same_doc = [chunk for chunk in evidence if chunk.doc_name == doc_name]
    if page is not None:
        for chunk in same_doc:
            if chunk.page == page:
                return chunk
    return same_doc[0] if same_doc else evidence[0]
155
 
156
 
def evaluate_bidder(bidder_id: str, criteria: list[Criterion]) -> list[Verdict]:
    """Evaluate one bidder against every criterion, in the order given.

    Returns one ``Verdict`` per entry of *criteria*.
    """
    verdicts: list[Verdict] = []
    for criterion in criteria:
        verdicts.append(evaluate(bidder_id, criterion))
    return verdicts
specs/09_evaluator.md ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spec 09 — Evaluator
2
+
3
+ **Step:** 9 of 15
4
+ **Time budget:** ~25 min
5
+ **Checkpoint:** `evaluate("bidder_a", c1)` returns eligible with high confidence; `evaluate("bidder_b", c1)` returns not_eligible (or needs_review when the combined confidence is below `CONFIDENCE_HIGH`).
6
+
7
+ ---
8
+
9
+ ## Goal
10
+
11
+ Implement `core/evaluator.py` — per-criterion verdict generation with combined confidence scoring and threshold-based safety rules.
12
+
13
+ ---
14
+
15
+ ## `evaluate(bidder_id: str, criterion: Criterion) -> Verdict`
16
+
17
+ ### Step 1 — Gather evidence
18
+
19
+ `evidence = bidder_processor.gather_evidence(bidder_id, criterion)`
20
+
21
+ If empty: return immediately:
22
+ ```python
23
+ Verdict(
24
+ bidder_id=bidder_id,
25
+ criterion_id=criterion.id,
26
+ verdict="needs_review",
27
+ reason="No matching evidence found in submitted documents.",
28
+ llm_confidence=0.0,
29
+ combined_confidence=0.0,
30
+ model_version=MODEL_VERSION,
31
+ timestamp=now_iso(),
32
+ )
33
+ ```
34
+ Log `criterion_evaluated` with verdict=needs_review.
35
+
36
+ ### Step 2 — Build LLM prompt
37
+
38
+ User message template:
39
+ ```
40
+ CRITERION:
41
+ {criterion.model_dump_json(indent=2)}
42
+
43
+ RETRIEVED EVIDENCE (top-k chunks from bidder {bidder_id}):
44
+ {json list of evidence dicts with doc_name, page, ocr_confidence, source_type, text}
45
+
46
+ Return JSON:
47
+ {
48
+ "verdict": "eligible" | "not_eligible" | "needs_review",
49
+ "extracted_value": "<short string as found in evidence>",
50
+ "normalized_value": <number or null>,
51
+ "chosen_source": {"doc_name": "...", "page": <int>, "snippet": "<= 200 chars", "source_type": "..."},
52
+ "llm_confidence": <0.0 to 1.0>,
53
+ "reason": "<one or two sentences>"
54
+ }
55
+
56
+ Rules:
57
+ - If evidence directly contains a value satisfying the rule, verdict=eligible with high llm_confidence.
58
+ - If evidence directly contradicts the rule, verdict=not_eligible.
59
+ - If no relevant evidence retrieved, verdict=needs_review, llm_confidence<=0.4.
60
+ - If the source is OCR with low confidence and the value is borderline, lean to needs_review.
61
+ ```
62
+
63
+ ### Step 3 — Call LLM
64
+
65
+ `result = llm.chat_json(EVALUATE_CRITERION_PROMPT_SYSTEM, user_prompt)`
66
+
67
+ On `LLMUnavailable`: return `fallback.load_evaluation(bidder_id, criterion.id)`.
68
+
69
+ ### Step 4 — Parse result
70
+
71
+ Extract: `verdict`, `extracted_value`, `normalized_value`, `chosen_source`, `llm_confidence`, `reason`.
72
+
73
+ Build `Source` object from `chosen_source`.
74
+
75
+ ### Step 5 — Combined confidence
76
+
77
+ Find the evidence chunk matching `chosen_source` to get `ocr_confidence` and `source_type`:
78
+
79
+ ```python
80
+ if source_type == "text_pdf":
81
+ combined = llm_confidence
82
+ elif source_type == "vision_llm":
83
+ combined = 0.7 * llm_confidence + 0.3 * 0.95
84
+ elif source_type == "tesseract":
85
+ tc = ocr_confidence if ocr_confidence and ocr_confidence >= 0 else 0.3
86
+ combined = 0.6 * llm_confidence + 0.4 * tc
87
+ else:
88
+ combined = llm_confidence
89
+ ```
90
+
91
+ ### Step 6 — Apply threshold safety rules (in order)
92
+
93
+ 1. If LLM verdict is `needs_review` → keep.
94
+ 2. If `combined >= CONFIDENCE_HIGH` → keep LLM verdict.
95
+ 3. If `CONFIDENCE_REVIEW <= combined < CONFIDENCE_HIGH` AND verdict is `not_eligible` → downgrade to `needs_review` (NEVER silently disqualify at medium confidence).
96
+ 4. If `combined < CONFIDENCE_REVIEW` → force `needs_review`. Otherwise (e.g. `eligible` at medium confidence) → keep the LLM verdict.
97
+
98
+ ### Step 7 — Build and return Verdict
99
+
100
+ ```python
101
+ Verdict(
102
+ bidder_id=bidder_id,
103
+ criterion_id=criterion.id,
104
+ verdict=final_verdict,
105
+ extracted_value=extracted_value,
106
+ normalized_value=normalized_value,
107
+ source=source,
108
+ llm_confidence=llm_confidence,
109
+ ocr_confidence=ocr_confidence_from_best_evidence,
110
+ combined_confidence=combined,
111
+ reason=reason,
112
+ model_version=MODEL_VERSION,
113
+ timestamp=now_iso(),
114
+ review_status="pending",
115
+ )
116
+ ```
117
+
118
+ Log `criterion_evaluated` to audit.
119
+
120
+ ---
121
+
122
+ ## `evaluate_bidder(bidder_id: str, criteria: list[Criterion]) -> list[Verdict]`
123
+
124
+ Calls `evaluate(bidder_id, c)` for each criterion in sequence. Returns list.
125
+
126
+ ---
127
+
128
+ ## Acceptance Criteria
129
+
130
+ 1. `evaluate("bidder_a", c1)` → `verdict="eligible"`, `combined_confidence >= 0.8` (or fallback eligible).
131
+ 2. `evaluate("bidder_b", c1)` → `verdict="not_eligible"` or `"needs_review"` (never silently eligible when turnover is below threshold).
132
+ 3. `evaluate_bidder("bidder_a", criteria)` returns 5 verdicts.
133
+ 4. All verdicts are `Verdict` instances with valid `review_status="pending"`.
134
+ 5. Audit log gains `criterion_evaluated` entries.