anky2002 committed on
Commit 89b6e5c · verified · 1 Parent(s): 5ed2449

Upload agents/utils.py with huggingface_hub
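The commit message refers to the standard huggingface_hub single-file upload flow. As a rough sketch only (the repo id below is a placeholder, not taken from this page, and the token is assumed to come from a prior login), an upload like this one typically looks like:

from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="agents/utils.py",   # local file to push
    path_in_repo="agents/utils.py",      # destination path inside the repo
    repo_id="<namespace>/<repo>",        # placeholder; replace with the actual repo id
    commit_message="Upload agents/utils.py with huggingface_hub",
)

For a dataset or Space repo, repo_type would also need to be passed.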

Files changed (1)
  1. agents/utils.py +35 -13
agents/utils.py CHANGED
@@ -7,34 +7,56 @@ from typing import List, Dict, Any
 def compute_agent_confidence(scores: List[float]) -> float:
     """
     Compute agent confidence using agreement-vs-cancellation logic.
-    Backported from semantic agent to ALL signal agents.
 
-    Returns a confidence value between 0.1 and 1.0 that reflects:
-    - High confidence when scores agree in direction and have high magnitude
-    - Low confidence when scores cancel each other out
-    - Low confidence when all scores are near zero (no signal)
+    Key fix: consistent weak agreement across MANY tests compounds confidence.
+    4 tests weakly agreeing is more confident than 1 test strongly agreeing.
     """
     if not scores:
         return 0.1
 
     avg = float(np.mean(scores))
+    n = len(scores)
 
     # Classify each score's direction
     signs = [1 if s > 0.05 else (-1 if s < -0.05 else 0) for s in scores]
     n_pos = sum(1 for s in signs if s > 0)
     n_neg = sum(1 for s in signs if s < 0)
     n_neu = sum(1 for s in signs if s == 0)
+    n_directional = n_pos + n_neg
 
-    if n_pos > 0 and n_neg > 0:
-        # Scores cancel — low confidence
-        agreement = max(n_pos, n_neg) / (n_pos + n_neg)
-        return min(1.0, 0.15 + 0.5 * abs(avg) * agreement)
-    elif n_neu == len(signs):
+    if n_directional == 0:
         # All genuinely neutral — low confidence
-        return 0.2
+        return 0.15
+
+    if n_pos > 0 and n_neg > 0:
+        # Mixed signals — confidence based on majority strength
+        majority = max(n_pos, n_neg)
+        minority = min(n_pos, n_neg)
+        agreement_ratio = majority / n_directional
+
+        # Penalize cancellation but still give credit for majority direction
+        return min(1.0, 0.1 + 0.4 * abs(avg) * agreement_ratio)
     else:
-        # Scores agree — confidence scales with magnitude
-        return min(1.0, 0.3 + 0.6 * abs(avg))
+        # All directional scores agree — confidence compounds with count
+        # Key insight: 10 tests all saying -0.1 is more confident than 1 test saying -0.3
+        n_agreeing = max(n_pos, n_neg)
+
+        # Base confidence from magnitude
+        magnitude_conf = abs(avg)
+
+        # Agreement bonus: sqrt scaling so 4 tests ≈ 2x confidence vs 1 test
+        agreement_bonus = min(1.0, np.sqrt(n_agreeing / 3.0))  # normalized: 3 agreeing = 1.0
+
+        # Combined: magnitude × agreement, with floor at 0.2 for any agreement
+        conf = 0.2 + 0.7 * magnitude_conf * agreement_bonus
+
+        # Additional boost for broad agreement (many tests, not just a few)
+        if n_agreeing >= 5:
+            conf += 0.1
+        if n_agreeing >= 10:
+            conf += 0.1
+
+        return min(1.0, conf)
 
 
 def compute_failure_prob(n_ran: int, n_total: int, n_insufficient: int = 0) -> float:
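A minimal usage sketch of the new confidence logic, assuming agents/utils.py is importable as agents.utils from the repo root; the score lists are invented and only illustrate which branch each case hits, with approximate values derived from the formulas in the diff above:

from agents.utils import compute_agent_confidence

# All scores inside the +/-0.05 neutral band: n_directional == 0, returns 0.15
print(compute_agent_confidence([0.0, 0.02, -0.03]))

# Mixed directions: cancellation branch, confidence stays low (~0.14)
print(compute_agent_confidence([0.4, 0.3, -0.2]))

# Four weakly negative scores: agreement branch, sqrt bonus saturates at 3+ (~0.28)
print(compute_agent_confidence([-0.1, -0.12, -0.08, -0.15]))

# Ten weakly negative scores: same branch plus the >=5 and >=10 boosts (~0.47)
print(compute_agent_confidence([-0.1] * 10))

With these constants, the count-based compounding shows up most clearly once five or more scores point the same way, where the extra +0.1 boosts apply.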