nkshirsa committed on
Commit
37b310e
·
verified ·
1 Parent(s): 07dc4a2

Add phd_research_os_v2/layer6/evaluator.py

Browse files
phd_research_os_v2/layer6/evaluator.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Layer 6: Evaluation Harness
3
+ ==============================
4
+ Regression gate, golden dataset management, quality metrics.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from ..core.database import get_db, gen_id, now_iso, from_fixed
11
+
12
+
13
# Numeric limits consulted by the regression gate.
# In the code visible here only "min_qualifier_preservation" and
# "min_null_detection_rate" are read (both compared with ``>=``); the other
# entries are presumably consumed elsewhere -- TODO confirm.
REGRESSION_THRESHOLDS = dict(
    min_extraction_recall=0.70,
    max_hallucination_rate=0.10,
    min_epistemic_accuracy=0.60,
    min_qualifier_preservation=0.50,
    min_null_detection_rate=0.30,
)
20
+
21
+
22
class Evaluator:
    """Evaluation harness with regression gate and quality metrics.

    Aggregates statistics from the ``claims`` and ``canonical_claims`` tables
    and records every regression-gate run in the ``eval_runs`` table.
    """

    def __init__(self, db_path: str = None, golden_path: str = "config/golden_dataset"):
        """Store the database location and the golden-dataset directory.

        Args:
            db_path: Passed unchanged to ``get_db``; ``None`` is forwarded as-is.
            golden_path: Location of the golden dataset, kept as a ``Path``.
                It is not read by any method visible in this class.
        """
        self.db_path = db_path
        self.golden_path = Path(golden_path)

    @staticmethod
    def _scalar(conn, sql: str):
        """Return the first column of the first row produced by ``sql``."""
        return conn.execute(sql).fetchone()[0]

    @staticmethod
    def _grouped_counts(conn, sql: str) -> dict:
        """Return ``{key: count}`` for a two-column ``SELECT key, COUNT(*)`` query."""
        # Positional access avoids depending on the auto-generated name of the
        # COUNT(*) column; rows are already indexed positionally elsewhere.
        return {row[0]: row[1] for row in conn.execute(sql).fetchall()}

    def compute_system_metrics(self) -> dict:
        """Compute current system-wide quality metrics.

        Returns:
            A dict of aggregate metrics. When the ``claims`` table is empty the
            dict only holds ``total_claims`` (0) and an explanatory ``message``.
        """
        conn = get_db(self.db_path)
        try:
            total = self._scalar(conn, "SELECT COUNT(*) FROM claims")
            if total == 0:
                return {"total_claims": 0, "message": "No claims to evaluate"}

            # Epistemic tag distribution
            tag_dist = self._grouped_counts(
                conn,
                "SELECT epistemic_tag, COUNT(*) FROM claims GROUP BY epistemic_tag",
            )

            # Status distribution
            status_dist = self._grouped_counts(
                conn,
                "SELECT status, COUNT(*) FROM claims GROUP BY status",
            )

            # Null result count
            null_count = self._scalar(
                conn,
                "SELECT COUNT(*) FROM claims WHERE is_null_result = 1",
            )

            # Average confidence scores (None when every row is NULL)
            avg_composite = self._scalar(
                conn,
                "SELECT AVG(composite_confidence) FROM claims WHERE composite_confidence IS NOT NULL",
            )
            avg_evidence = self._scalar(
                conn,
                "SELECT AVG(evidence_quality) FROM claims WHERE evidence_quality IS NOT NULL",
            )

            # Section distribution
            section_dist = self._grouped_counts(
                conn,
                "SELECT source_section, COUNT(*) FROM claims WHERE source_section IS NOT NULL GROUP BY source_section",
            )

            # Qualifier stats: claims whose qualifiers column is neither NULL nor '[]'
            with_qualifiers = self._scalar(
                conn,
                "SELECT COUNT(*) FROM claims WHERE qualifiers IS NOT NULL AND qualifiers != '[]'",
            )

            # Canonical dedup ratio
            canonical_count = self._scalar(conn, "SELECT COUNT(*) FROM canonical_claims")
        finally:
            # Release the connection even when a query raises.
            conn.close()

        # total is guaranteed non-zero here (empty case returned above).
        return {
            "total_claims": total,
            "epistemic_distribution": tag_dist,
            "status_distribution": status_dist,
            "null_results": null_count,
            "null_rate": round(null_count / total, 3),
            # from_fixed presumably converts the stored fixed-point integer to
            # a float -- TODO confirm against core.database.
            "avg_composite_confidence": round(from_fixed(int(avg_composite or 0)), 3),
            "avg_evidence_quality": round(from_fixed(int(avg_evidence or 0)), 3),
            "section_distribution": section_dist,
            "claims_with_qualifiers": with_qualifiers,
            "qualifier_rate": round(with_qualifiers / total, 3),
            "canonical_claims": canonical_count,
            "dedup_ratio": round(canonical_count / total, 3),
        }

    def run_regression_gate(self, metrics: dict = None) -> dict:
        """Run the regression gate against ``REGRESSION_THRESHOLDS``.

        Args:
            metrics: Pre-computed metrics; when ``None`` they are obtained from
                ``compute_system_metrics()``.

        Returns:
            A dict with ``passed``, ``run_id``, ``checks`` and
            ``metrics_summary``. When there are no claims the result is a
            failure with a ``reason`` and an empty ``checks`` list, and nothing
            is written to ``eval_runs``.
        """
        if metrics is None:
            metrics = self.compute_system_metrics()

        if metrics.get("total_claims", 0) == 0:
            return {"passed": False, "reason": "No claims to evaluate", "checks": []}

        total_claims = metrics["total_claims"]
        checks = []

        # Check qualifier preservation
        qual_rate = metrics.get("qualifier_rate", 0)
        qual_threshold = REGRESSION_THRESHOLDS["min_qualifier_preservation"]
        checks.append({
            "name": "Qualifier preservation",
            "value": qual_rate,
            "threshold": qual_threshold,
            "passed": qual_rate >= qual_threshold,
        })

        # Check null detection; waived (counts as passed) below 50 claims.
        null_rate = metrics.get("null_rate", 0)
        null_threshold = REGRESSION_THRESHOLDS["min_null_detection_rate"]
        too_few_claims = total_claims < 50
        checks.append({
            "name": "Null result detection",
            "value": null_rate,
            "threshold": null_threshold,
            "passed": null_rate >= null_threshold or too_few_claims,
            "note": "Skipped (< 50 claims)" if too_few_claims else None,
        })

        # Check epistemic diversity (should have at least 2 distinct tags)
        tag_count = len(metrics.get("epistemic_distribution", {}))
        checks.append({
            "name": "Epistemic diversity",
            "value": tag_count,
            "threshold": 2,
            "passed": tag_count >= 2,
        })

        # The gate passes only if every recorded check passed. Previously a
        # failed null-detection check was reported but never failed the gate.
        all_passed = all(check["passed"] for check in checks)

        # Log the eval run
        run_id = gen_id("EVAL")
        conn = get_db(self.db_path)
        try:
            conn.execute("""
                INSERT INTO eval_runs (run_id, run_type, metrics, passed, pipeline_version, created_at)
                VALUES (?, 'regression', ?, ?, '2.1.0', ?)
            """, (run_id, json.dumps({"metrics": metrics, "checks": checks}),
                  int(all_passed), now_iso()))
            conn.commit()
        finally:
            # Release the connection even when the insert or commit raises.
            conn.close()

        return {
            "passed": all_passed,
            "run_id": run_id,
            "checks": checks,
            "metrics_summary": {
                "total_claims": total_claims,
                "avg_confidence": metrics.get("avg_composite_confidence", 0),
                "qualifier_rate": qual_rate,
                "null_rate": null_rate,
            },
        }