Upload agents/utils.py with huggingface_hub
Browse files- agents/utils.py +35 -13
agents/utils.py
CHANGED
|
@@ -7,34 +7,56 @@ from typing import List, Dict, Any
|
|
| 7 |
def compute_agent_confidence(scores: List[float]) -> float:
|
| 8 |
"""
|
| 9 |
Compute agent confidence using agreement-vs-cancellation logic.
|
| 10 |
-
Backported from semantic agent to ALL signal agents.
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
- Low confidence when scores cancel each other out
|
| 15 |
-
- Low confidence when all scores are near zero (no signal)
|
| 16 |
"""
|
| 17 |
if not scores:
|
| 18 |
return 0.1
|
| 19 |
|
| 20 |
avg = float(np.mean(scores))
|
|
|
|
| 21 |
|
| 22 |
# Classify each score's direction
|
| 23 |
signs = [1 if s > 0.05 else (-1 if s < -0.05 else 0) for s in scores]
|
| 24 |
n_pos = sum(1 for s in signs if s > 0)
|
| 25 |
n_neg = sum(1 for s in signs if s < 0)
|
| 26 |
n_neu = sum(1 for s in signs if s == 0)
|
|
|
|
| 27 |
|
| 28 |
-
if
|
| 29 |
-
# Scores cancel — low confidence
|
| 30 |
-
agreement = max(n_pos, n_neg) / (n_pos + n_neg)
|
| 31 |
-
return min(1.0, 0.15 + 0.5 * abs(avg) * agreement)
|
| 32 |
-
elif n_neu == len(signs):
|
| 33 |
# All genuinely neutral — low confidence
|
| 34 |
-
return 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
else:
|
| 36 |
-
#
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def compute_failure_prob(n_ran: int, n_total: int, n_insufficient: int = 0) -> float:
|
|
|
|
| 7 |
def compute_agent_confidence(scores: List[float]) -> float:
    """
    Compute agent confidence using agreement-vs-cancellation logic.

    Each score is classified as positive (> 0.05), negative (< -0.05) or
    neutral, and the confidence depends on how the directional scores line up:

    - no usable scores (empty input, or only NaN/inf values): 0.1
    - every score neutral: 0.15
    - mixed directions: ``0.1 + 0.4 * |mean| * majority_share``
    - all directional scores agree:
      ``0.2 + 0.7 * |mean| * min(1, sqrt(k / 3))``, plus 0.1 when k >= 5 and
      a further 0.1 when k >= 10, where k is the number of agreeing scores.

    ``mean`` is taken over all usable scores, neutral ones included, so
    neutral results dilute the magnitude term. At equal magnitude, more
    agreeing tests always give at least as much confidence as fewer.

    Args:
        scores: Per-test signal scores. Presumably in [-1, 1] (the 0.05
            thresholds and the scaling factors suggest that range) --
            TODO confirm against callers. NaN and infinite entries are
            ignored.

    Returns:
        Confidence as a plain ``float`` in [0.1, 1.0].
    """
    # len() instead of truthiness so an ndarray argument does not raise.
    if scores is None or len(scores) == 0:
        return 0.1

    # Drop NaN/inf up front: a non-finite mean would otherwise propagate into
    # the final min(1.0, conf), and min(1.0, nan) evaluates to 1.0, i.e.
    # corrupt input would be reported with maximum confidence.
    usable = [float(s) for s in scores if np.isfinite(s)]
    if not usable:
        return 0.1

    magnitude = abs(float(np.mean(usable)))

    # Classify each score's direction; anything within +/-0.05 is neutral.
    n_pos = sum(1 for s in usable if s > 0.05)
    n_neg = sum(1 for s in usable if s < -0.05)
    n_directional = n_pos + n_neg

    if n_directional == 0:
        # All genuinely neutral -- low confidence
        return 0.15

    if n_pos > 0 and n_neg > 0:
        # Mixed signals -- penalize cancellation but still give credit for
        # the majority direction.
        agreement_ratio = max(n_pos, n_neg) / n_directional
        return min(1.0, 0.1 + 0.4 * magnitude * agreement_ratio)

    # All directional scores agree -- confidence compounds with count.
    # Only one of n_pos / n_neg is non-zero here.
    n_agreeing = n_directional

    # Agreement bonus: sqrt scaling, saturating at 3 agreeing scores.
    agreement_bonus = min(1.0, float(np.sqrt(n_agreeing / 3.0)))

    # Combined: magnitude x agreement, with a floor of 0.2 for any agreement.
    conf = 0.2 + 0.7 * magnitude * agreement_bonus

    # Additional boost for broad agreement (many tests, not just a few).
    if n_agreeing >= 5:
        conf += 0.1
    if n_agreeing >= 10:
        conf += 0.1

    return min(1.0, conf)
|
| 60 |
|
| 61 |
|
| 62 |
def compute_failure_prob(n_ran: int, n_total: int, n_insufficient: int = 0) -> float:
|