Spaces:

anky2002
/

FORENSIQ

Running

App Files Files Community

anky2002 committed 17 days ago

Commit

89b6e5c

verified ·

1 Parent(s): 5ed2449

Upload agents/utils.py with huggingface_hub

Browse files

Files changed (1) hide show

agents/utils.py +35 -13

agents/utils.py CHANGED Viewed

@@ -7,34 +7,56 @@ from typing import List, Dict, Any
 def compute_agent_confidence(scores: List[float]) -> float:
     """
     Compute agent confidence using agreement-vs-cancellation logic.
-    Backported from semantic agent to ALL signal agents.
-    Returns a confidence value between 0.1 and 1.0 that reflects:
-    - High confidence when scores agree in direction and have high magnitude
-    - Low confidence when scores cancel each other out
-    - Low confidence when all scores are near zero (no signal)
     """
     if not scores:
         return 0.1
     avg = float(np.mean(scores))
     # Classify each score's direction
     signs = [1 if s > 0.05 else (-1 if s < -0.05 else 0) for s in scores]
     n_pos = sum(1 for s in signs if s > 0)
     n_neg = sum(1 for s in signs if s < 0)
     n_neu = sum(1 for s in signs if s == 0)
-    if n_pos > 0 and n_neg > 0:
-        # Scores cancel — low confidence
-        agreement = max(n_pos, n_neg) / (n_pos + n_neg)
-        return min(1.0, 0.15 + 0.5 * abs(avg) * agreement)
-    elif n_neu == len(signs):
         # All genuinely neutral — low confidence
-        return 0.2
     else:
-        # Scores agree — confidence scales with magnitude
-        return min(1.0, 0.3 + 0.6 * abs(avg))
 def compute_failure_prob(n_ran: int, n_total: int, n_insufficient: int = 0) -> float:

 def compute_agent_confidence(scores: List[float]) -> float:
     """
     Compute agent confidence using agreement-vs-cancellation logic.

     Each finite score is classified as positive (> 0.05), negative (< -0.05)
     or neutral (anything in between). The result depends on how the
     directional (non-neutral) scores relate to each other:

     - no usable scores (None, empty, or all NaN/inf): 0.1
     - every score neutral: 0.15
     - mixed directions: 0.1 + 0.4 * |mean| * majority share, so cancellation
       is penalized but the majority direction still earns some credit
     - all directional scores agree: 0.2 + 0.7 * |mean| * agreement bonus,
       plus 0.1 at >= 5 agreeing scores and another 0.1 at >= 10

     The mean is taken over all finite scores, neutral ones included.
     Non-finite scores (NaN, +/-inf) are ignored: a NaN would otherwise
     propagate into the result, and ``min(1.0, nan)`` evaluates to 1.0,
     turning a broken test into maximum confidence.

     Args:
         scores: Signed per-test scores; the sign encodes direction.

     Returns:
         A plain Python float, never greater than 1.0.
     """
     if scores is None:
         return 0.1

     # Keep only usable values; NaN/inf come from failed or degenerate tests.
     finite = [float(s) for s in scores if np.isfinite(s)]
     if not finite:
         return 0.1

     magnitude_conf = abs(float(np.mean(finite)))

     # Classify each score's direction (the +/-0.05 band counts as neutral)
     n_pos = sum(1 for s in finite if s > 0.05)
     n_neg = sum(1 for s in finite if s < -0.05)
     n_directional = n_pos + n_neg

     if n_directional == 0:
         # All genuinely neutral — low confidence
         return 0.15

     if n_pos > 0 and n_neg > 0:
         # Mixed signals — confidence based on majority strength
         agreement_ratio = max(n_pos, n_neg) / n_directional
         return float(min(1.0, 0.1 + 0.4 * magnitude_conf * agreement_ratio))

     # All directional scores agree — confidence grows with their count.
     n_agreeing = n_directional
     # sqrt scaling that saturates at 3 agreeing scores: a single score keeps
     # about 58% of the magnitude term, three or more keep all of it.
     agreement_bonus = min(1.0, float(np.sqrt(n_agreeing / 3.0)))
     # Floor of 0.2 for any agreement, plus magnitude scaled by the bonus
     conf = 0.2 + 0.7 * magnitude_conf * agreement_bonus
     # Additional boost for broad agreement (many tests, not just a few)
     if n_agreeing >= 5:
         conf += 0.1
     if n_agreeing >= 10:
         conf += 0.1
     return float(min(1.0, conf))
 def compute_failure_prob(n_ran: int, n_total: int, n_insufficient: int = 0) -> float: