SRE: regression test for all 5 vulnerability patches
Browse files- tests/test_sre_regression.py +162 -0
tests/test_sre_regression.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
SRE Regression Tests β Verify all 5 critical vulnerability patches work.
|
| 4 |
+
|
| 5 |
+
These test the EXACT failure scenarios from the SRE audit:
|
| 6 |
+
S1: Dict iteration during modification (MemoryStore)
|
| 7 |
+
S2: UNKNOWN action propagation (Actor)
|
| 8 |
+
S3: Context overflow from heuristic bloat (Actor prompt)
|
| 9 |
+
S4: Race condition in parallel swarm (ExperienceReplay)
|
| 10 |
+
S5: None score crash in trajectory math (Trajectory)
|
| 11 |
+
"""
|
| 12 |
+
import sys, os, json, threading
|
| 13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 14 |
+
|
| 15 |
+
# Global pass/fail tallies; mutated by check() and read by the final report.
PASS = FAIL = 0

def check(name, cond, detail=""):
    """Record one test outcome and print a single result line.

    Increments the module-level PASS or FAIL counter depending on *cond*.
    *detail* is appended to the printed line only when the check failed.
    """
    global PASS, FAIL
    if cond:
        PASS += 1
        suffix = ""
    else:
        FAIL += 1
        suffix = f": {detail}" if detail else ""
    print(f" {'β' if cond else 'β'} {name}" + suffix)
|
| 20 |
+
|
| 21 |
+
# Import triggers sre_patches.apply_all()
|
| 22 |
+
import purpose_agent as pa
|
| 23 |
+
from purpose_agent.types import State, Action, Trajectory, TrajectoryStep, PurposeScore, Heuristic, MemoryTier
|
| 24 |
+
from purpose_agent.memory import MemoryStore, MemoryCard, MemoryKind, MemoryStatus
|
| 25 |
+
from purpose_agent.v2_types import MemoryScope
|
| 26 |
+
|
| 27 |
+
print("βββ SRE Scenario 1: Dict Snapshot During Iteration βββ")
store = MemoryStore()

def _promoted_card(n):
    # One promoted skill card; n keeps pattern/strategy unique per card.
    return MemoryCard(kind=MemoryKind.SKILL_CARD, status=MemoryStatus.PROMOTED,
                      pattern=f"p{n}", strategy=f"s{n}")

# Seed the store with 100 cards before starting the concurrent writer.
for n in range(100):
    store.add(_promoted_card(n))

def modify_during_retrieve():
    """Keep adding cards while the main thread runs retrieve() β must not crash."""
    for n in range(100, 200):
        store.add(_promoted_card(n))

writer = threading.Thread(target=modify_during_retrieve)
writer.start()
try:
    results = store.retrieve("test query", top_k=10)
    check("S1 No RuntimeError during concurrent modify", True)
except RuntimeError as e:
    # Pre-patch behavior: "dictionary changed size during iteration".
    check("S1 No RuntimeError during concurrent modify", False, str(e))
writer.join()
|
| 48 |
+
|
| 49 |
+
print("\nβββ SRE Scenario 2: UNKNOWN Action Rejection βββ")
# LLM backend that returns output the parser cannot handle, forcing the
# UNKNOWN-action path inside the Actor.
mock = pa.MockLLMBackend()
mock.register_handler("goal-directed agent", "totally unparseable garbage !@#$%")
mock.set_structured_default({"phi_before":0,"phi_after":0,"reasoning":"x","evidence":"x","confidence":0.5})

from purpose_agent.orchestrator import SimpleEnvironment
env = SimpleEnvironment(execute_fn=lambda a, s: State(data={"x": 1}))
orch = pa.Orchestrator(llm=mock, environment=env,
                       available_actions={"test": "test", "DONE": "done"},
                       critic_mode="standard")

try:
    r = orch.run_task(purpose="test", max_steps=2)
    # Reaching here means the UNKNOWN action was rejected, not propagated.
    check("S2 No crash on garbage LLM output", True)
    steps = r.trajectory.steps
    if steps:
        # The rejected action should have been converted to DONE.
        last_action = steps[-1].action.name
        check("S2 UNKNOWN β DONE fallback", last_action == "DONE", f"got {last_action}")
    else:
        check("S2 Has steps", False, "no steps recorded")
except Exception as e:
    check("S2 No crash on garbage LLM output", False, f"{type(e).__name__}: {e}")
|
| 71 |
+
|
| 72 |
+
print("\nβββ SRE Scenario 3: Heuristic Cap (Context Overflow Prevention) βββ")
mock2 = pa.MockLLMBackend()
done_reply = json.dumps({"thought":"t","action":{"name":"DONE","params":{}},"expected_delta":"d"})
mock2.register_handler("goal-directed agent", done_reply)
mock2.set_structured_default({"phi_before":0,"phi_after":5,"reasoning":"r","evidence":"e","confidence":0.7})

env2 = SimpleEnvironment(execute_fn=lambda a, s: State(data={}))
orch2 = pa.Orchestrator(llm=mock2, environment=env2,
                        available_actions={"DONE": "done"}, critic_mode="standard")

# Inject 200 verbose heuristics; without the cap these would blow past an
# SLM-sized context window when rendered into the system prompt.
library = orch2.optimizer.heuristic_library
for n in range(200):
    library.append(Heuristic(
        pattern=f"Pattern {n} " * 10, strategy=f"Strategy {n} " * 10,
        steps=[], tier=MemoryTier.STRATEGIC, q_value=0.5 + n * 0.001,
    ))
orch2.sync_memory()

# Render the prompt and count heuristic bullet lines β the cap should keep
# the count small regardless of library size.
prompt = orch2.actor._build_system_prompt()
heuristic_lines = [line for line in prompt.split("\n")
                   if line.strip().startswith(("- When:", "- Pattern"))]
check("S3 Heuristics capped", len(heuristic_lines) <= 10, f"got {len(heuristic_lines)} (should be β€10)")
check("S3 Prompt not massive", len(prompt) < 5000, f"prompt is {len(prompt)} chars")
|
| 94 |
+
|
| 95 |
+
print("\nβββ SRE Scenario 4: Thread-Safe ExperienceReplay βββ")
er = pa.ExperienceReplay(capacity=100)
errors = []

def add_many(start):
    """Build and add 50 single-step trajectories; collect any exception text."""
    for i in range(50):
        try:
            traj = Trajectory(task_description=f"task_{start}_{i}", purpose=f"p_{start}_{i}")
            step = TrajectoryStep(
                state_before=State(data={}), action=Action(name="x"),
                state_after=State(data={"i": i}),
                score=PurposeScore(phi_before=0, phi_after=5, delta=5,
                                   reasoning="r", evidence="e", confidence=0.8),
            )
            traj.steps.append(step)
            er.add(traj)
        except Exception as e:
            errors.append(str(e))

# Four writers adding concurrently β pre-patch this raced on the buffer.
threads = [threading.Thread(target=add_many, args=(j * 100,)) for j in range(4)]
for worker in threads:
    worker.start()
for worker in threads:
    worker.join()

check("S4 No errors in concurrent add", len(errors) == 0, f"{len(errors)} errors")
check("S4 All items added", er.size > 0, f"size={er.size}")
|
| 119 |
+
|
| 120 |
+
print("\nβββ SRE Scenario 5: None Score Guard βββ")
t = Trajectory(task_description="test", purpose="test")
# Step with score=None simulates a HITL interrupt mid-evaluation.
t.steps.append(TrajectoryStep(
    state_before=State(data={}), action=Action(name="x"),
    state_after=State(data={}), score=None,  # None score!
))
# Second step carries the only real score (phi_after=7, delta=7).
t.steps.append(TrajectoryStep(
    state_before=State(data={}), action=Action(name="y"),
    state_after=State(data={}),
    score=PurposeScore(phi_before=0, phi_after=7, delta=7, reasoning="r", evidence="e", confidence=0.9),
))

# Each derived property must skip the None score instead of raising TypeError.
try:
    cr = t.cumulative_reward
    check("S5 cumulative_reward with None score", isinstance(cr, float), f"got {cr}")
except TypeError as e:
    check("S5 cumulative_reward with None score", False, str(e))

try:
    td = t.total_delta
    check("S5 total_delta with None score", isinstance(td, float), f"got {td}")
except TypeError as e:
    check("S5 total_delta with None score", False, str(e))

try:
    sr = t.success_rate
    # Consistency fix: pass the observed value as detail like the sibling checks.
    check("S5 success_rate with None score", isinstance(sr, float), f"got {sr}")
except TypeError as e:
    check("S5 success_rate with None score", False, str(e))

try:
    fp = t.final_phi
    # Bug fix: the original condition `fp == 7.0 or fp is not None` was
    # vacuously true for ANY non-None value, so a wrong final_phi could never
    # fail this check. The only scored step has phi_after=7, so require 7.0.
    check("S5 final_phi with None score", fp == 7.0, f"got {fp}")
except (TypeError, AttributeError) as e:
    check("S5 final_phi with None score", False, str(e))
|
| 156 |
+
|
| 157 |
+
# βββ REPORT βββ
# Summarize tallies and exit non-zero on any failure (CI-friendly).
banner = "=" * 50
print(f"\n{banner}")
print(f" SRE Regression: {PASS} pass, {FAIL} fail")
verdict = "ALL PASS β" if FAIL == 0 else f"{FAIL} FAILURES β CRITICAL"
print(f" {verdict}")
print(banner)
sys.exit(0 if FAIL == 0 else 1)
|