Rohan03 committed on
Commit
0cb26a3
·
verified ·
1 Parent(s): 563a647

SRE: regression test for all 5 vulnerability patches

Browse files
Files changed (1) hide show
  1. tests/test_sre_regression.py +162 -0
tests/test_sre_regression.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
SRE Regression Tests — Verify all 5 critical vulnerability patches work.

These test the EXACT failure scenarios from the SRE audit:
S1: Dict iteration during modification (MemoryStore)
S2: UNKNOWN action propagation (Actor)
S3: Context overflow from heuristic bloat (Actor prompt)
S4: Race condition in parallel swarm (ExperienceReplay)
S5: None score crash in trajectory math (Trajectory)
"""
import json
import os
import sys
import threading

# Make the package under test importable when running from tests/.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

# Global pass/fail tallies, updated by check() and reported at exit.
PASS = FAIL = 0


def check(name, cond, detail=""):
    """Record one test result and print a one-line verdict.

    Args:
        name: Human-readable label for the assertion.
        cond: Truthiness decides pass/fail.  Coerced via bool() so any
            truthy object is accepted — the previous int(cond) raised
            TypeError for non-numeric truthy values (e.g. a list).
        detail: Extra context; printed only when the check fails.
    """
    global PASS, FAIL
    ok = bool(cond)
    PASS += ok
    FAIL += not ok
    suffix = f": {detail}" if detail and not ok else ""
    print(f" {'βœ“' if ok else 'βœ—'} {name}" + suffix)
21
+ # Import triggers sre_patches.apply_all()
22
+ import purpose_agent as pa
23
+ from purpose_agent.types import State, Action, Trajectory, TrajectoryStep, PurposeScore, Heuristic, MemoryTier
24
+ from purpose_agent.memory import MemoryStore, MemoryCard, MemoryKind, MemoryStatus
25
+ from purpose_agent.v2_types import MemoryScope
26
+
print("═══ SRE Scenario 1: Dict Snapshot During Iteration ═══")
store = MemoryStore()
for i in range(100):
    store.add(MemoryCard(kind=MemoryKind.SKILL_CARD, status=MemoryStatus.PROMOTED,
                         pattern=f"p{i}", strategy=f"s{i}"))

# Simulate concurrent modification during retrieval
def modify_during_retrieve():
    """Add cards while retrieve is iterating — should NOT crash."""
    for i in range(100, 200):
        store.add(MemoryCard(kind=MemoryKind.SKILL_CARD, status=MemoryStatus.PROMOTED,
                             pattern=f"p{i}", strategy=f"s{i}"))

t = threading.Thread(target=modify_during_retrieve)
t.start()
try:
    results = store.retrieve("test query", top_k=10)
    check("S1 No RuntimeError during concurrent modify", True)
except RuntimeError as e:
    check("S1 No RuntimeError during concurrent modify", False, str(e))
finally:
    # Always reap the writer thread: previously join() was skipped when
    # retrieve() raised anything other than RuntimeError, leaking the
    # thread into the later scenarios.
    t.join()
print("\n═══ SRE Scenario 2: UNKNOWN Action Rejection ═══")
# Feed the actor output that no parser can turn into a valid action.
mock = pa.MockLLMBackend()
mock.register_handler("goal-directed agent", "totally unparseable garbage !@#$%")
mock.set_structured_default(
    {"phi_before": 0, "phi_after": 0, "reasoning": "x", "evidence": "x", "confidence": 0.5}
)

from purpose_agent.orchestrator import SimpleEnvironment
env = SimpleEnvironment(execute_fn=lambda a, s: State(data={"x": 1}))
orch = pa.Orchestrator(
    llm=mock,
    environment=env,
    available_actions={"test": "test", "DONE": "done"},
    critic_mode="standard",
)

try:
    r = orch.run_task(purpose="test", max_steps=2)
except Exception as e:
    check("S2 No crash on garbage LLM output", False, f"{type(e).__name__}: {e}")
else:
    # Garbage LLM output must degrade gracefully, never crash the run.
    check("S2 No crash on garbage LLM output", True)
    if not r.trajectory.steps:
        check("S2 Has steps", False, "no steps recorded")
    else:
        # The unparseable action should have been rejected and replaced by DONE.
        last_action = r.trajectory.steps[-1].action.name
        check("S2 UNKNOWN β†’ DONE fallback", last_action == "DONE", f"got {last_action}")
print("\n═══ SRE Scenario 3: Heuristic Cap (Context Overflow Prevention) ═══")
mock2 = pa.MockLLMBackend()
actor_reply = json.dumps({"thought": "t", "action": {"name": "DONE", "params": {}}, "expected_delta": "d"})
mock2.register_handler("goal-directed agent", actor_reply)
mock2.set_structured_default({"phi_before": 0, "phi_after": 5, "reasoning": "r", "evidence": "e", "confidence": 0.7})

env2 = SimpleEnvironment(execute_fn=lambda a, s: State(data={}))
orch2 = pa.Orchestrator(llm=mock2, environment=env2, available_actions={"DONE": "done"}, critic_mode="standard")

# Inject 200 heuristics (would overflow SLM context without cap)
for i in range(200):
    heuristic = Heuristic(
        pattern=f"Pattern {i} " * 10,
        strategy=f"Strategy {i} " * 10,
        steps=[],
        tier=MemoryTier.STRATEGIC,
        q_value=0.5 + i * 0.001,
    )
    orch2.optimizer.heuristic_library.append(heuristic)
orch2.sync_memory()

# Build the prompt — should be capped, not 200 entries
prompt = orch2.actor._build_system_prompt()
# Count the heuristic bullet entries actually present in the prompt.
heuristic_lines = [
    line for line in prompt.split("\n")
    if line.strip().startswith(("- When:", "- Pattern"))
]
check("S3 Heuristics capped", len(heuristic_lines) <= 10, f"got {len(heuristic_lines)} (should be ≀10)")
check("S3 Prompt not massive", len(prompt) < 5000, f"prompt is {len(prompt)} chars")
print("\n═══ SRE Scenario 4: Thread-Safe ExperienceReplay ═══")
er = pa.ExperienceReplay(capacity=100)
errors = []

def add_many(start):
    # Worker: push 50 single-step trajectories, recording any exception.
    for i in range(50):
        try:
            traj = Trajectory(task_description=f"task_{start}_{i}", purpose=f"p_{start}_{i}")
            step = TrajectoryStep(
                state_before=State(data={}),
                action=Action(name="x"),
                state_after=State(data={"i": i}),
                score=PurposeScore(phi_before=0, phi_after=5, delta=5,
                                   reasoning="r", evidence="e", confidence=0.8),
            )
            traj.steps.append(step)
            er.add(traj)
        except Exception as exc:
            errors.append(str(exc))

# 4 threads adding concurrently
threads = [threading.Thread(target=add_many, args=(j * 100,)) for j in range(4)]
for worker in threads:
    worker.start()
for worker in threads:
    worker.join()

check("S4 No errors in concurrent add", len(errors) == 0, f"{len(errors)} errors")
check("S4 All items added", er.size > 0, f"size={er.size}")
print("\n═══ SRE Scenario 5: None Score Guard ═══")
t = Trajectory(task_description="test", purpose="test")
# Add steps with None scores (simulates HITL interrupt mid-eval)
unscored_step = TrajectoryStep(
    state_before=State(data={}),
    action=Action(name="x"),
    state_after=State(data={}),
    score=None,  # None score!
)
scored_step = TrajectoryStep(
    state_before=State(data={}),
    action=Action(name="y"),
    state_after=State(data={}),
    score=PurposeScore(phi_before=0, phi_after=7, delta=7,
                       reasoning="r", evidence="e", confidence=0.9),
)
t.steps.append(unscored_step)
t.steps.append(scored_step)

# Each derived property must skip the None score instead of raising.
try:
    cr = t.cumulative_reward
except TypeError as e:
    check("S5 cumulative_reward with None score", False, str(e))
else:
    check("S5 cumulative_reward with None score", isinstance(cr, float), f"got {cr}")

try:
    td = t.total_delta
except TypeError as e:
    check("S5 total_delta with None score", False, str(e))
else:
    check("S5 total_delta with None score", isinstance(td, float), f"got {td}")

try:
    sr = t.success_rate
except TypeError as e:
    check("S5 success_rate with None score", False, str(e))
else:
    check("S5 success_rate with None score", isinstance(sr, float))

try:
    fp = t.final_phi
except (TypeError, AttributeError) as e:
    check("S5 final_phi with None score", False, str(e))
else:
    check("S5 final_phi with None score", fp == 7.0 or fp is not None, f"got {fp}")
# ═══ REPORT ═══
# Final tally; exit non-zero so CI fails on any regression.
bar = "=" * 50
print(f"\n{bar}")
print(f" SRE Regression: {PASS} pass, {FAIL} fail")
verdict = "ALL PASS βœ“" if FAIL == 0 else f"{FAIL} FAILURES β€” CRITICAL"
print(f" {verdict}")
print(bar)
sys.exit(0 if FAIL == 0 else 1)