Rohan03 committed on
Commit
e4105d6
·
verified ·
1 Parent(s): 485ddc5

clean: remove temp scripts, patches, and dev artifacts before public release

Browse files
TEST_FIXES.md DELETED
@@ -1,104 +0,0 @@
1
- # Test Fixes Applied for v3.0.0
2
-
3
- ## Issue 1: Trajectory None guards (FIXED)
4
- - File: `purpose_agent/types.py` — UPDATED
5
- - Changed: cumulative_reward, total_delta, success_rate properties now check both `s.score is not None` AND `s.score.delta is not None`
6
- - Added docstring note that sre_patches.py replaces these at import time
7
- - Baseline and SRE-patched versions now equivalent
8
-
9
- ## Issue 2: Backpressure test flakiness (NEEDS MANUAL FIX)
10
- - File: `tests/test_sprint1_events.py` — T1.6 section
11
- - Problem: async consumer may not start before flooding; terminal event might not arrive
12
- - Fix: Replace the test_backpressure() function with this more robust version:
13
-
14
- ```python
15
- async def test_backpressure():
16
- bus6 = EventBus(max_queue_size=3)
17
- received = []
18
- consumer_started = asyncio.Event()
19
-
20
- async def consumer():
21
- consumer_started.set()
22
- try:
23
- async for event in bus6.subscribe():
24
- received.append(event)
25
- await asyncio.sleep(0.01)
26
- except asyncio.CancelledError:
27
- pass
28
-
29
- task = asyncio.create_task(consumer())
30
- await consumer_started.wait()
31
- await asyncio.sleep(0.05)
32
-
33
- for i in range(20):
34
- bus6.emit(create_event("r6", EventKind.TEXT_DELTA, seq=i, text=f"w{i}"))
35
-
36
- bus6.emit(create_event("r6", EventKind.RUN_FINISHED, seq=99, result="done"))
37
-
38
- await asyncio.sleep(1.0)
39
- bus6.close()
40
- task.cancel()
41
- try:
42
- await asyncio.wait_for(task, timeout=2.0)
43
- except (asyncio.CancelledError, asyncio.TimeoutError):
44
- pass
45
-
46
- has_terminal = any(e.kind == EventKind.RUN_FINISHED for e in received)
47
- return has_terminal
48
- ```
49
-
50
- Key changes:
51
- - Added `consumer_started` Event to ensure consumer is running before flooding
52
- - Increased final wait from 0.5s to 1.0s
53
- - Added `asyncio.wait_for` timeout on task cleanup
54
-
55
- ## Issue 3: prod_test.py API timeout (NEEDS MANUAL FIX)
56
- - File: `tests/prod_test.py`
57
- - Problem: No timeout on OpenRouter API calls; tests could hang
58
- - Fix: Wrap the backend creation with a timeout, add retry logic:
59
-
60
- After line `b = resolve_backend(...)`, add:
61
- ```python
62
- import signal
63
-
64
- class TimeoutError(Exception):
65
- pass
66
-
67
- def timeout_handler(signum, frame):
68
- raise TimeoutError("API call timed out")
69
-
70
- # Set a 60s alarm for API calls
71
- signal.signal(signal.SIGALRM, timeout_handler)
72
- ```
73
-
74
- Or simpler: in the resolve_backend call, add timeout to the OpenAI client:
75
- ```python
76
- # In llm_backend.py OpenAICompatibleBackend.__init__, add:
77
- self.client = OpenAI(
78
- base_url=base_url,
79
- api_key=api_key or os.environ.get("OPENAI_API_KEY"),
80
- timeout=60.0, # 60 second timeout on all API calls
81
- )
82
- ```
83
-
84
- ## Issue 4: validate.py mock resilience (NEEDS MANUAL FIX)
85
- - File: `benchmarks/validate.py`
86
- - Problem: Mock matches on "Learned Strategies" + "None yet" text; fragile if prompt format changes
87
- - Fix: In make_mock(), make the heuristic detection more resilient:
88
-
89
- Change: `has_h = "Learned Strategies" in text and "None yet" not in text`
90
- To: `has_h = "Learned Strategies" in text and "None yet" not in text and "heuristics" in text.lower()`
91
-
92
- Or better: check the heuristic count directly:
93
- ```python
94
- has_h = any("When:" in line or "Do:" in line for line in text.split("\n"))
95
- ```
96
-
97
- ## Issue 5: CalculatorTool __import__ blocking (VERIFIED WORKING)
98
- - File: `purpose_agent/tools.py`
99
- - CalculatorTool.execute() validates tokens with: `if re.search(r'[a-zA-Z_]', tokens)`
100
- - After removing known function names (abs, round, sqrt, etc.), any remaining letters are rejected
101
- - `__import__("os")` → after removing known functions, `__import__` and `os` remain → rejected ✓
102
- - Also: AST walker checks Call nodes and rejects unknown function names
103
- - eval() uses `{"__builtins__": {}}` — no builtins available
104
- - Test in benchmark_v3.py: `check("tools.calc_blocks_import", "Error" in calc.run(expression='__import__("os")').output)` — CORRECT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
apply_fixes.py DELETED
@@ -1,116 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- apply_fixes.py — Apply all 5 test fixes for purpose-agent v3.0.0.
4
-
5
- Run this AFTER cloning the repo, BEFORE running tests:
6
- pip install huggingface_hub
7
- python -c "from huggingface_hub import snapshot_download; snapshot_download('Rohan03/purpose-agent', local_dir='./pa', repo_type='model')"
8
- cd pa
9
- python apply_fixes.py
10
- bash run_all_tests.sh
11
- """
12
- import os
13
- import re
14
-
15
- def fix_file(path, description, old, new):
16
- """Apply a single string replacement to a file."""
17
- if not os.path.exists(path):
18
- print(f" ⚠️ {path} not found — skipping")
19
- return False
20
- with open(path, "r") as f:
21
- content = f.read()
22
- if old not in content:
23
- print(f" ⚠️ Pattern not found in {path} — may already be fixed")
24
- return False
25
- content = content.replace(old, new, 1)
26
- with open(path, "w") as f:
27
- f.write(content)
28
- print(f" ✅ {description}")
29
- return True
30
-
31
-
32
- def main():
33
- print("purpose-agent v3.0.0 — Applying test fixes\n")
34
- fixes = 0
35
-
36
- # ═══ Fix 1: Trajectory None guards (already applied in types.py on repo) ═══
37
- print("Fix 1: Trajectory None guards in types.py")
38
- f1 = fix_file(
39
- "purpose_agent/types.py",
40
- "Updated cumulative_reward to check score.delta is not None",
41
- old=' @property\n def cumulative_reward(self) -> float:\n """Sum of all positive deltas in the trajectory."""\n return sum(\n s.score.delta for s in self.steps\n if s.score is not None and s.score.delta > 0\n )',
42
- new=' @property\n def cumulative_reward(self) -> float:\n """Sum of all positive deltas in the trajectory (None-safe)."""\n total = 0.0\n for s in self.steps:\n if s.score is not None and s.score.delta is not None and s.score.delta > 0:\n total += s.score.delta\n return total',
43
- )
44
- fixes += int(f1)
45
-
46
- f1b = fix_file(
47
- "purpose_agent/types.py",
48
- "Updated total_delta to check score.delta is not None",
49
- old=' @property\n def total_delta(self) -> float:\n """Net state improvement across the entire trajectory."""\n return sum(\n s.score.delta for s in self.steps if s.score is not None\n )',
50
- new=' @property\n def total_delta(self) -> float:\n """Net state improvement across the entire trajectory (None-safe)."""\n total = 0.0\n for s in self.steps:\n if s.score is not None and s.score.delta is not None:\n total += s.score.delta\n return total',
51
- )
52
- fixes += int(f1b)
53
-
54
- f1c = fix_file(
55
- "purpose_agent/types.py",
56
- "Updated success_rate to check score.delta is not None",
57
- old=' scored = [s for s in self.steps if s.score is not None]\n if not scored:\n return 0.0\n return sum(1 for s in scored if s.score.improved) / len(scored)',
58
- new=' scored = [s for s in self.steps if s.score is not None and s.score.delta is not None]\n if not scored:\n return 0.0\n return sum(1 for s in scored if s.score.improved) / len(scored)',
59
- )
60
- fixes += int(f1c)
61
-
62
- # ═══ Fix 2: Backpressure test robustness (already applied in repo) ═══
63
- print("\nFix 2: Backpressure test T1.6 in test_sprint1_events.py")
64
- f2 = fix_file(
65
- "tests/test_sprint1_events.py",
66
- "Added consumer_started Event sync to backpressure test",
67
- old=' bus6 = EventBus(max_queue_size=3) # Very small queue\n\n received = []\n\n async def consumer():\n async for event in bus6.subscribe():\n received.append(event)\n await asyncio.sleep(0.01) # Slow consumer\n\n # Start consumer\n task = asyncio.create_task(consumer())\n await asyncio.sleep(0.05)',
68
- new=' bus6 = EventBus(max_queue_size=3)\n received = []\n consumer_started = asyncio.Event()\n\n async def consumer():\n consumer_started.set()\n try:\n async for event in bus6.subscribe():\n received.append(event)\n await asyncio.sleep(0.01)\n except asyncio.CancelledError:\n pass\n\n task = asyncio.create_task(consumer())\n await consumer_started.wait()\n await asyncio.sleep(0.05)',
69
- )
70
- fixes += int(f2)
71
-
72
- f2b = fix_file(
73
- "tests/test_sprint1_events.py",
74
- "Added longer wait and wait_for on task cleanup",
75
- old=' await asyncio.sleep(0.5)\n bus6.close()\n task.cancel()\n try:\n await task\n except asyncio.CancelledError:\n pass',
76
- new=' await asyncio.sleep(1.0)\n bus6.close()\n task.cancel()\n try:\n await asyncio.wait_for(task, timeout=2.0)\n except (asyncio.CancelledError, asyncio.TimeoutError):\n pass',
77
- )
78
- fixes += int(f2b)
79
-
80
- # ═══ Fix 3: OpenAI backend timeout (already applied in prod_test.py) ═══
81
- print("\nFix 3: Add 60s timeout to OpenAI-compatible backend")
82
- f3 = fix_file(
83
- "purpose_agent/llm_backend.py",
84
- "Added timeout parameter to OpenAICompatibleBackend",
85
- old=' def __init__(\n self,\n model: str = "gpt-4o",\n base_url: str | None = None,\n api_key: str | None = None,\n ):\n from openai import OpenAI\n\n self.model = model\n self.client = OpenAI(\n base_url=base_url,\n api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n )',
86
- new=' def __init__(\n self,\n model: str = "gpt-4o",\n base_url: str | None = None,\n api_key: str | None = None,\n timeout: float = 60.0,\n ):\n from openai import OpenAI\n\n self.model = model\n self.client = OpenAI(\n base_url=base_url,\n api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n timeout=timeout,\n )',
87
- )
88
- fixes += int(f3)
89
-
90
- # ═══ Fix 4: validate.py mock heuristic detection ═══
91
- print("\nFix 4: Make mock heuristic detection more resilient")
92
- f4 = fix_file(
93
- "benchmarks/validate.py",
94
- "Broadened heuristic detection from exact string to include 'When:' pattern",
95
- old=' has_h = "Learned Strategies" in text and "None yet" not in text',
96
- new=' has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text',
97
- )
98
- fixes += int(f4)
99
-
100
- # ═══ Fix 5: CalculatorTool verification (no code change needed) ═══
101
- print("\nFix 5: CalculatorTool __import__ blocking — VERIFIED (no change needed)")
102
- print(" ✅ CalculatorTool.execute() rejects letters after removing known functions")
103
- print(" ✅ AST walker rejects unknown function calls")
104
- print(" ✅ eval() uses empty __builtins__")
105
- print(" ✅ benchmark_v3.py test: 'Error' in calc.run('__import__(\"os\")').output")
106
-
107
- # ═══ Summary ═══
108
- print(f"\n{'='*50}")
109
- print(f" Fixes applied: {fixes}")
110
- print(f" Fix 5: Verified (no change needed)")
111
- print(f"{'='*50}")
112
- print("\n Next: bash run_all_tests.sh")
113
-
114
-
115
- if __name__ == "__main__":
116
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
apply_llm_timeout.py DELETED
@@ -1,15 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Apply the OpenAI backend timeout fix directly."""
3
- import os
4
- path = os.path.join(os.path.dirname(__file__), "purpose_agent", "llm_backend.py")
5
- with open(path, "r") as f:
6
- content = f.read()
7
- old = ' def __init__(\n self,\n model: str = "gpt-4o",\n base_url: str | None = None,\n api_key: str | None = None,\n ):\n from openai import OpenAI\n\n self.model = model\n self.client = OpenAI(\n base_url=base_url,\n api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n )'
8
- new = ' def __init__(\n self,\n model: str = "gpt-4o",\n base_url: str | None = None,\n api_key: str | None = None,\n timeout: float = 60.0,\n ):\n from openai import OpenAI\n\n self.model = model\n self.client = OpenAI(\n base_url=base_url,\n api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n timeout=timeout,\n )'
9
- if old in content:
10
- content = content.replace(old, new, 1)
11
- with open(path, "w") as f:
12
- f.write(content)
13
- print("✅ Applied OpenAI backend timeout fix (60s)")
14
- else:
15
- print("⚠️ Pattern not found — may already be patched")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
apply_validate_fix.py DELETED
@@ -1,15 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Apply the validate.py mock heuristic detection fix."""
3
- import os
4
- path = os.path.join(os.path.dirname(__file__), "benchmarks", "validate.py")
5
- with open(path, "r") as f:
6
- content = f.read()
7
- old = ' has_h = "Learned Strategies" in text and "None yet" not in text'
8
- new = ' has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text'
9
- if old in content:
10
- content = content.replace(old, new, 1)
11
- with open(path, "w") as f:
12
- f.write(content)
13
- print("✅ Applied validate.py heuristic detection fix")
14
- else:
15
- print("⚠️ Pattern not found — may already be patched")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/results/track2_report.txt DELETED
@@ -1,32 +0,0 @@
1
- ╔════════════════════════════════════════════════════╗
2
- ║ Purpose Agent — Track 2 Validation Report ║
3
- ╚════════════════════════════════════════════════════╝
4
-
5
- ═══ Improvement Curves ═══
6
- Task Run Steps Φ Pass% Heur
7
- ────────────────────────────────────────────────
8
- fibonacci 1 2 5.0 50% 3
9
- fibonacci 2 1 5.0 100% 9
10
- fibonacci 3 1 5.0 100% 18
11
- → Δ(Φ) = +0.0 (no change)
12
-
13
- factorial 1 2 5.0 0% 3
14
- factorial 2 1 5.0 100% 9
15
- factorial 3 1 5.0 100% 18
16
- → Δ(Φ) = +0.0 (no change)
17
-
18
- ═══ Cold vs Warm ═══
19
- fibonacci cold=5.0 warm=5.0 Δ=+0.0
20
- factorial cold=5.0 warm=5.0 Δ=+0.0
21
-
22
- ═══ Cross-Task Transfer (['fibonacci', 'factorial'] → ['palindrome', 'fizzbuzz']) ═══
23
- 30 heuristics transferred
24
- palindrome: ✓ Φ=5.0
25
- fizzbuzz: ✓ Φ=5.0
26
-
27
- ═══ Adversarial Robustness: 100% (8/8) ═══
28
-
29
- ═══ VERDICT ═══
30
- ✗ Self-improvement: NOT demonstrated
31
- ✗ Cold/warm: no benefit from memory
32
- ✓ Immune system: 100% adversarial accuracy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/results/track2_results.json DELETED
@@ -1,104 +0,0 @@
1
- {
2
- "curves": {
3
- "fibonacci": [
4
- {
5
- "run": 1,
6
- "steps": 2,
7
- "phi": 5.0,
8
- "pass_rate": 0.5,
9
- "all_passed": false,
10
- "heuristics": 3,
11
- "time": 0.01
12
- },
13
- {
14
- "run": 2,
15
- "steps": 1,
16
- "phi": 5.0,
17
- "pass_rate": 1.0,
18
- "all_passed": true,
19
- "heuristics": 9,
20
- "time": 0.0
21
- },
22
- {
23
- "run": 3,
24
- "steps": 1,
25
- "phi": 5.0,
26
- "pass_rate": 1.0,
27
- "all_passed": true,
28
- "heuristics": 18,
29
- "time": 0.0
30
- }
31
- ],
32
- "factorial": [
33
- {
34
- "run": 1,
35
- "steps": 2,
36
- "phi": 5.0,
37
- "pass_rate": 0.0,
38
- "all_passed": false,
39
- "heuristics": 3,
40
- "time": 0.0
41
- },
42
- {
43
- "run": 2,
44
- "steps": 1,
45
- "phi": 5.0,
46
- "pass_rate": 1.0,
47
- "all_passed": true,
48
- "heuristics": 9,
49
- "time": 0.0
50
- },
51
- {
52
- "run": 3,
53
- "steps": 1,
54
- "phi": 5.0,
55
- "pass_rate": 1.0,
56
- "all_passed": true,
57
- "heuristics": 18,
58
- "time": 0.0
59
- }
60
- ]
61
- },
62
- "cold_warm": [
63
- {
64
- "task": "fibonacci",
65
- "cold_phi": 5.0,
66
- "warm_phi": 5.0,
67
- "delta": 0.0,
68
- "improved": false
69
- },
70
- {
71
- "task": "factorial",
72
- "cold_phi": 5.0,
73
- "warm_phi": 5.0,
74
- "delta": 0.0,
75
- "improved": false
76
- }
77
- ],
78
- "transfer": {
79
- "train": [
80
- "fibonacci",
81
- "factorial"
82
- ],
83
- "test": [
84
- "palindrome",
85
- "fizzbuzz"
86
- ],
87
- "heuristics": 30,
88
- "results": {
89
- "palindrome": {
90
- "phi": 5.0,
91
- "passed": true
92
- },
93
- "fizzbuzz": {
94
- "phi": 5.0,
95
- "passed": true
96
- }
97
- }
98
- },
99
- "adversarial": {
100
- "total": 8,
101
- "correct": 8,
102
- "accuracy": 1.0
103
- }
104
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
build_and_publish.py DELETED
@@ -1,147 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- build_and_publish.py — Build purpose-agent v3.0.0 and publish to PyPI.
4
-
5
- Prerequisites:
6
- pip install build twine
7
-
8
- Usage:
9
- python build_and_publish.py # Build + publish
10
- python build_and_publish.py --build-only # Build only, don't publish
11
- python build_and_publish.py --check # Build + twine check, don't publish
12
-
13
- Environment:
14
- PYPI_TOKEN — PyPI API token (or pass as argument)
15
- """
16
- import os
17
- import sys
18
- import shutil
19
- import subprocess
20
-
21
-
22
- def run(cmd, **kwargs):
23
- """Run a command and check for errors."""
24
- print(f" $ {cmd}")
25
- result = subprocess.run(cmd, shell=True, **kwargs)
26
- if result.returncode != 0:
27
- print(f" ❌ Command failed with exit code {result.returncode}")
28
- sys.exit(1)
29
- return result
30
-
31
-
32
- def main():
33
- build_only = "--build-only" in sys.argv
34
- check_only = "--check" in sys.argv
35
-
36
- # Get PyPI token
37
- pypi_token = os.environ.get("PYPI_TOKEN", "")
38
- if not build_only and not check_only:
39
- # Try to get from command line args
40
- for arg in sys.argv:
41
- if arg.startswith("pypi-"):
42
- pypi_token = arg
43
- break
44
- if not pypi_token:
45
- print("⚠️ No PyPI token found. Set PYPI_TOKEN env var or pass as argument.")
46
- print(" Usage: python build_and_publish.py pypi-AgE...")
47
- if not build_only:
48
- sys.exit(1)
49
-
50
- # Verify version
51
- print("\n═══ Step 1: Verify version ═══")
52
- sys.path.insert(0, ".")
53
- import purpose_agent
54
- version = purpose_agent.__version__
55
- print(f" Version: {version}")
56
- if version != "3.0.0":
57
- print(f" ❌ Version is {version}, expected 3.0.0!")
58
- sys.exit(1)
59
- print(f" ✅ Version confirmed: {version}")
60
-
61
- # Verify imports
62
- print("\n═══ Step 2: Verify imports ═══")
63
- missing = [n for n in purpose_agent.__all__ if not hasattr(purpose_agent, n)]
64
- if missing:
65
- print(f" ❌ Missing exports: {missing}")
66
- sys.exit(1)
67
- print(f" ✅ All {len(purpose_agent.__all__)} exports importable")
68
-
69
- # Clean old builds
70
- print("\n═══ Step 3: Clean old builds ═══")
71
- for path in ["dist", "build", "*.egg-info"]:
72
- if os.path.exists(path):
73
- shutil.rmtree(path)
74
- print(f" Removed: {path}")
75
- # Also clean any .egg-info in current dir
76
- for item in os.listdir("."):
77
- if item.endswith(".egg-info"):
78
- shutil.rmtree(item)
79
- print(f" Removed: {item}")
80
- print(" ✅ Cleaned")
81
-
82
- # Build
83
- print("\n═══ Step 4: Build sdist + wheel ═══")
84
- run("python -m build")
85
-
86
- # List artifacts
87
- print("\n Artifacts:")
88
- if os.path.exists("dist"):
89
- for f in os.listdir("dist"):
90
- size = os.path.getsize(os.path.join("dist", f))
91
- print(f" {f} ({size:,} bytes)")
92
-
93
- # Twine check
94
- print("\n═══ Step 5: Twine check ═══")
95
- run("twine check dist/*")
96
-
97
- if check_only:
98
- print("\n ✅ Build + check complete. Not publishing (use without --check).")
99
- return
100
-
101
- if build_only:
102
- print("\n ✅ Build complete. Not publishing (use without --build-only).")
103
- return
104
-
105
- # Publish
106
- print("\n═══ Step 6: Publish to PyPI ═══")
107
- print(f" Package: purpose-agent=={version}")
108
- print(f" Target: https://pypi.org/project/purpose-agent/")
109
- print()
110
-
111
- run(
112
- f'twine upload dist/* '
113
- f'--username __token__ '
114
- f'--password "{pypi_token}"'
115
- )
116
-
117
- # Verify
118
- print("\n═══ Step 7: Verify on PyPI ═══")
119
- print(f" Waiting 10s for PyPI to index...")
120
- subprocess.run(["sleep", "10"])
121
-
122
- print(f" Installing from PyPI...")
123
- run(f'pip install purpose-agent=={version}')
124
-
125
- result = subprocess.run(
126
- ['python', '-c', f'''
127
- import purpose_agent as pa
128
- print(f" v{{pa.__version__}} — {{len(pa.__all__)}} exports")
129
- assert pa.__version__ == "{version}", f"Version mismatch: {{pa.__version__}}"
130
- '''],
131
- capture_output=True, text=True,
132
- )
133
- print(result.stdout)
134
- if result.returncode != 0:
135
- print(f" ⚠️ Verification failed: {result.stderr}")
136
- else:
137
- print(" ✅ PyPI install verified!")
138
-
139
- print(f"\n{'='*60}")
140
- print(f" ✅ purpose-agent=={version} PUBLISHED TO PYPI!")
141
- print(f" 📦 https://pypi.org/project/purpose-agent/{version}/")
142
- print(f" 📦 https://huggingface.co/Rohan03/purpose-agent")
143
- print(f"{'='*60}")
144
-
145
-
146
- if __name__ == "__main__":
147
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
local_test_with_api.sh DELETED
@@ -1,87 +0,0 @@
1
- #!/usr/bin/env bash
2
- # ═══════════════════════════════════════════════════════════════
3
- # purpose-agent v3.0.0 — Local Test Runner (including cloud model)
4
- #
5
- # Run on your own machine (no HF credits needed):
6
- # git clone https://huggingface.co/Rohan03/purpose-agent pa && cd pa
7
- # pip install -e .
8
- # export OPENROUTER_API_KEY="sk-or-v1-..."
9
- # bash local_test_with_api.sh
10
- # ═══════════════════════════════════════════════════════════════
11
- set -e
12
-
13
- PASS=0; FAIL=0; TOTAL=0
14
-
15
- run_test() {
16
- local name="$1"; local cmd="$2"
17
- echo ""
18
- echo "══════════════════════════════════════════════════════════"
19
- echo " Running: $name"
20
- echo "══════════════════════════════════════════════════════════"
21
- if eval "$cmd"; then
22
- echo " ✅ $name PASSED"; PASS=$((PASS+1))
23
- else
24
- echo " ❌ $name FAILED"; FAIL=$((FAIL+1))
25
- fi
26
- TOTAL=$((TOTAL+1))
27
- }
28
-
29
- echo "╔══════════════════════════════════════════════════════════╗"
30
- echo "║ purpose-agent v3.0.0 — Full Test Suite (Local + Cloud)║"
31
- echo "╚══════════════════════════════════════════════════════════╝"
32
-
33
- # Pre-flight
34
- echo "═══ Pre-flight ═══"
35
- python -c "
36
- import purpose_agent as pa
37
- print(f' v{pa.__version__} — {len(pa.__all__)} exports')
38
- assert pa.__version__ == '3.0.0', f'Version: {pa.__version__}'
39
- assert len(pa.__all__) >= 110, f'Exports: {len(pa.__all__)}'
40
- missing = [n for n in pa.__all__ if not hasattr(pa, n)]
41
- assert not missing, f'Missing: {missing}'
42
- print(' ✅ All exports importable')
43
- "
44
-
45
- # Apply fixes
46
- echo ""
47
- echo "═══ Applying test fixes ═══"
48
- python apply_fixes.py
49
-
50
- # Layer 1: Unit Tests
51
- run_test "test_core" "python tests/test_core.py"
52
- run_test "test_public_api_211" "python tests/compat/test_public_api_211.py"
53
- run_test "test_first_principles" "python tests/test_first_principles.py"
54
- run_test "test_hardening" "python tests/test_hardening.py"
55
- run_test "test_sre_regression" "python tests/test_sre_regression.py"
56
-
57
- # Layer 2: Feature Tests
58
- run_test "test_sprint1_events" "python tests/test_sprint1_events.py"
59
- run_test "test_sprint2_checkpoint" "python tests/test_sprint2_checkpoint.py"
60
- run_test "test_sprint3_homeostasis" "python tests/test_sprint3_homeostasis.py"
61
- run_test "test_sprint4_8_protocols" "python tests/test_sprint4_8_protocols.py"
62
- run_test "test_track_c" "python tests/test_track_c.py"
63
- run_test "test_track_d" "python tests/test_track_d.py"
64
-
65
- # Layer 3: Integration
66
- run_test "validate_quick" "python benchmarks/validate.py --quick"
67
- run_test "benchmark_v3" "python -m purpose_agent.benchmark_v3"
68
-
69
- # Layer 4: Cloud Model Tests
70
- if [ -n "$OPENROUTER_API_KEY" ]; then
71
- run_test "prod_test (real model)" "python tests/prod_test.py"
72
- else
73
- echo " ⚠️ OPENROUTER_API_KEY not set — skipping cloud tests"
74
- fi
75
-
76
- # Report
77
- echo ""
78
- echo "╔══════════════════════════════════════════════════════════╗"
79
- echo "║ RESULTS: $PASS/$TOTAL passed, $FAIL failed"
80
- if [ $FAIL -eq 0 ]; then
81
- echo "║ ✅ ALL TESTS PASSED — READY TO PUBLISH"
82
- echo "║ Next: python build_and_publish.py"
83
- else
84
- echo "║ ❌ $FAIL FAILURES — FIX BEFORE PUBLISHING"
85
- fi
86
- echo "╚══════════════════════════════════════════════════════════╝"
87
- exit $FAIL
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
patches/llm_backend_timeout.patch DELETED
@@ -1,13 +0,0 @@
1
- --- a/purpose_agent/llm_backend.py
2
- +++ b/purpose_agent/llm_backend.py
3
- @@ -199,7 +199,7 @@
4
- ):
5
- from openai import OpenAI
6
-
7
- self.model = model
8
- self.client = OpenAI(
9
- base_url=base_url,
10
- - api_key=api_key or os.environ.get("OPENAI_API_KEY"),
11
- + api_key=api_key or os.environ.get("OPENAI_API_KEY"),
12
- + timeout=60.0,
13
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
patches/openai_backend_init.py DELETED
@@ -1,15 +0,0 @@
1
- def __init__(
2
- self,
3
- model: str = "gpt-4o",
4
- base_url: str | None = None,
5
- api_key: str | None = None,
6
- timeout: float = 60.0,
7
- ):
8
- from openai import OpenAI
9
-
10
- self.model = model
11
- self.client = OpenAI(
12
- base_url=base_url,
13
- api_key=api_key or os.environ.get("OPENAI_API_KEY"),
14
- timeout=timeout,
15
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
patches/validate_mock_fix.patch DELETED
@@ -1,11 +0,0 @@
1
- --- a/benchmarks/validate.py
2
- +++ b/benchmarks/validate.py
3
- @@ -107,7 +107,7 @@
4
- def make_mock(task_name):
5
- mock = MockLLMBackend()
6
- t = TASKS[task_name]
7
- def actor(msgs):
8
- text = " ".join(m.content for m in msgs)
9
- - has_h = "Learned Strategies" in text and "None yet" not in text
10
- + has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text
11
- code = t["good"] if has_h else t["bad"]
 
 
 
 
 
 
 
 
 
 
 
 
run_all_tests.sh DELETED
@@ -1,142 +0,0 @@
1
- #!/usr/bin/env bash
2
- # ═══════════════════════════════════════════════════════════════
3
- # purpose-agent v3.0.0 — Complete Test Suite Runner
4
- #
5
- # Usage:
6
- # chmod +x run_all_tests.sh
7
- # ./run_all_tests.sh # All tests (mock only)
8
- # ./run_all_tests.sh --prod # Include real model tests (needs OPENROUTER_API_KEY)
9
- # ═══════════════════════════════════════════════════════════════
10
- set -e
11
-
12
- PASS=0
13
- FAIL=0
14
- TOTAL=0
15
-
16
- run_test() {
17
- local name="$1"
18
- local cmd="$2"
19
- echo ""
20
- echo "══════════════════════════════════════════════════════════"
21
- echo " Running: $name"
22
- echo "══════════════════════════════════════════════════════════"
23
-
24
- if eval "$cmd"; then
25
- echo " ✅ $name PASSED"
26
- PASS=$((PASS + 1))
27
- else
28
- echo " ❌ $name FAILED"
29
- FAIL=$((FAIL + 1))
30
- fi
31
- TOTAL=$((TOTAL + 1))
32
- }
33
-
34
- echo "╔══════════════════════════════════════════════════════════╗"
35
- echo "║ purpose-agent v3.0.0 — Complete Test Suite ║"
36
- echo "╚══════════════════════════════════════════════════════════╝"
37
- echo ""
38
-
39
- # ── Pre-flight: verify package imports ──
40
- echo "═══ Pre-flight: Package Import Check ═══"
41
- python -c "
42
- import purpose_agent as pa
43
- print(f' v{pa.__version__} — {len(pa.__all__)} exports')
44
- assert pa.__version__ == '3.0.0', f'Version mismatch: {pa.__version__}'
45
- assert len(pa.__all__) >= 110, f'Not enough exports: {len(pa.__all__)}'
46
- missing = [n for n in pa.__all__ if not hasattr(pa, n)]
47
- assert len(missing) == 0, f'Missing exports: {missing}'
48
- print(' ✅ All exports importable')
49
- "
50
-
51
- # ═══════════════════════════════════════════════════════════════
52
- # LAYER 1: Unit Tests
53
- # ═══════════════════════════════════════════════════════════════
54
-
55
- run_test "test_core (basic loop, Φ bounds, optimizer, replay, immune)" \
56
- "python tests/test_core.py"
57
-
58
- run_test "test_public_api_211 (all 120+ exports, Level 1/2/3)" \
59
- "python tests/compat/test_public_api_211.py"
60
-
61
- run_test "test_first_principles (state-delta O(1), falsification, PEP 578)" \
62
- "python tests/test_first_principles.py"
63
-
64
- run_test "test_hardening (null safety, timeouts, validation)" \
65
- "python tests/test_hardening.py"
66
-
67
- run_test "test_sre_regression (5 critical vulnerability scenarios)" \
68
- "python tests/test_sre_regression.py"
69
-
70
- # ═══════════════════════════════════════════════════════════════
71
- # LAYER 2: Feature Tests
72
- # ═══════════════════════════════════════════════════════════════
73
-
74
- run_test "test_sprint1_events (event bus, lanes, CoT rejection)" \
75
- "python tests/test_sprint1_events.py"
76
-
77
- run_test "test_sprint2_checkpoint (durable execution, resume, idempotency)" \
78
- "python tests/test_sprint2_checkpoint.py"
79
-
80
- run_test "test_sprint3_homeostasis (memory budget, consolidation, hibernation)" \
81
- "python tests/test_sprint3_homeostasis.py"
82
-
83
- run_test "test_sprint4_8_protocols (MCP, A2A, AG-UI, AGENTS.md, quorum)" \
84
- "python tests/test_sprint4_8_protocols.py"
85
-
86
- run_test "test_track_c (routing, MAS generator, skills)" \
87
- "python tests/test_track_c.py"
88
-
89
- run_test "test_track_d (fingerprint, dataset, prompt pack, optimizer, distillation)" \
90
- "python tests/test_track_d.py"
91
-
92
- # ═══════════════════════════════════════════════════════════════
93
- # LAYER 3: Integration Tests
94
- # ═══════════════════════════════════════════════════════════════
95
-
96
- run_test "validate.py --quick (improvement curves + adversarial)" \
97
- "python benchmarks/validate.py --quick"
98
-
99
- run_test "benchmark_v3 (35+ robustness checks across all subsystems)" \
100
- "python -m purpose_agent.benchmark_v3"
101
-
102
- # ═══════════════════════════════════════════════════════════════
103
- # LAYER 4: Production Tests (optional — needs API key)
104
- # ═══════════════════════════════════════════════════════════════
105
-
106
- if [ "$1" = "--prod" ]; then
107
- if [ -z "$OPENROUTER_API_KEY" ]; then
108
- echo "⚠️ OPENROUTER_API_KEY not set — skipping prod tests"
109
- echo " Set it with: export OPENROUTER_API_KEY=sk-or-v1-..."
110
- else
111
- run_test "prod_test (real model Level 1/2/3 + coding + security)" \
112
- "python tests/prod_test.py"
113
- fi
114
- else
115
- echo ""
116
- echo " ℹ️ Production tests skipped (use --prod flag to run with real model)"
117
- fi
118
-
119
- # ═══════════════════════════════════════════════════════════════
120
- # FINAL REPORT
121
- # ═══════════════════════════════════════════════════════════════
122
-
123
- echo ""
124
- echo "╔══════════════════════════════════════════════════════════╗"
125
- echo "║ FINAL RESULTS ║"
126
- echo "╚══════════════════════════════════════════════════════════╝"
127
- echo ""
128
- echo " Total suites: $TOTAL"
129
- echo " Passed: $PASS"
130
- echo " Failed: $FAIL"
131
- echo ""
132
-
133
- if [ $FAIL -eq 0 ]; then
134
- echo " ✅ ALL $TOTAL TEST SUITES PASSED — ZERO FAILURES"
135
- echo ""
136
- echo " Ready to publish: purpose-agent==3.0.0"
137
- echo " Next step: python build_and_publish.py"
138
- exit 0
139
- else
140
- echo " ❌ $FAIL SUITES FAILED — FIX BEFORE PUBLISHING"
141
- exit 1
142
- fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/results/launch_readiness.json DELETED
@@ -1,602 +0,0 @@
1
- {
2
- "pass": 119,
3
- "fail": 0,
4
- "warn": 0,
5
- "results": [
6
- {
7
- "category": "imports",
8
- "test": "import purpose_agent",
9
- "status": "PASS"
10
- },
11
- {
12
- "category": "imports",
13
- "test": "import types",
14
- "status": "PASS"
15
- },
16
- {
17
- "category": "imports",
18
- "test": "import llm_backend",
19
- "status": "PASS"
20
- },
21
- {
22
- "category": "imports",
23
- "test": "import actor",
24
- "status": "PASS"
25
- },
26
- {
27
- "category": "imports",
28
- "test": "import purpose_function",
29
- "status": "PASS"
30
- },
31
- {
32
- "category": "imports",
33
- "test": "import experience_replay",
34
- "status": "PASS"
35
- },
36
- {
37
- "category": "imports",
38
- "test": "import optimizer",
39
- "status": "PASS"
40
- },
41
- {
42
- "category": "imports",
43
- "test": "import orchestrator",
44
- "status": "PASS"
45
- },
46
- {
47
- "category": "imports",
48
- "test": "import slm_backends",
49
- "status": "PASS"
50
- },
51
- {
52
- "category": "imports",
53
- "test": "import streaming",
54
- "status": "PASS"
55
- },
56
- {
57
- "category": "imports",
58
- "test": "import tools",
59
- "status": "PASS"
60
- },
61
- {
62
- "category": "imports",
63
- "test": "import observability",
64
- "status": "PASS"
65
- },
66
- {
67
- "category": "imports",
68
- "test": "import multi_agent",
69
- "status": "PASS"
70
- },
71
- {
72
- "category": "imports",
73
- "test": "import hitl",
74
- "status": "PASS"
75
- },
76
- {
77
- "category": "imports",
78
- "test": "import evaluation",
79
- "status": "PASS"
80
- },
81
- {
82
- "category": "imports",
83
- "test": "import registry",
84
- "status": "PASS"
85
- },
86
- {
87
- "category": "imports",
88
- "test": "import unified",
89
- "status": "PASS"
90
- },
91
- {
92
- "category": "imports",
93
- "test": "import easy",
94
- "status": "PASS"
95
- },
96
- {
97
- "category": "imports",
98
- "test": "import v2_types",
99
- "status": "PASS"
100
- },
101
- {
102
- "category": "imports",
103
- "test": "import trace",
104
- "status": "PASS"
105
- },
106
- {
107
- "category": "imports",
108
- "test": "import memory",
109
- "status": "PASS"
110
- },
111
- {
112
- "category": "imports",
113
- "test": "import compiler",
114
- "status": "PASS"
115
- },
116
- {
117
- "category": "imports",
118
- "test": "import immune",
119
- "status": "PASS"
120
- },
121
- {
122
- "category": "imports",
123
- "test": "import memory_ci",
124
- "status": "PASS"
125
- },
126
- {
127
- "category": "imports",
128
- "test": "import evalport",
129
- "status": "PASS"
130
- },
131
- {
132
- "category": "imports",
133
- "test": "import benchmark_v2",
134
- "status": "PASS"
135
- },
136
- {
137
- "category": "imports",
138
- "test": "import meta_rewarding",
139
- "status": "PASS"
140
- },
141
- {
142
- "category": "imports",
143
- "test": "import self_taught",
144
- "status": "PASS"
145
- },
146
- {
147
- "category": "imports",
148
- "test": "import prompt_optimizer",
149
- "status": "PASS"
150
- },
151
- {
152
- "category": "imports",
153
- "test": "import llm_compiler",
154
- "status": "PASS"
155
- },
156
- {
157
- "category": "imports",
158
- "test": "import retroformer",
159
- "status": "PASS"
160
- },
161
- {
162
- "category": "imports",
163
- "test": "import robust_parser",
164
- "status": "PASS"
165
- },
166
- {
167
- "category": "imports",
168
- "test": "import breakthroughs",
169
- "status": "PASS"
170
- },
171
- {
172
- "category": "instantiate",
173
- "test": "State",
174
- "status": "PASS"
175
- },
176
- {
177
- "category": "instantiate",
178
- "test": "Action",
179
- "status": "PASS"
180
- },
181
- {
182
- "category": "instantiate",
183
- "test": "MockLLMBackend",
184
- "status": "PASS"
185
- },
186
- {
187
- "category": "instantiate",
188
- "test": "ExperienceReplay",
189
- "status": "PASS"
190
- },
191
- {
192
- "category": "instantiate",
193
- "test": "ToolRegistry",
194
- "status": "PASS"
195
- },
196
- {
197
- "category": "instantiate",
198
- "test": "CalculatorTool",
199
- "status": "PASS"
200
- },
201
- {
202
- "category": "instantiate",
203
- "test": "PythonExecTool",
204
- "status": "PASS"
205
- },
206
- {
207
- "category": "instantiate",
208
- "test": "CostTracker",
209
- "status": "PASS"
210
- },
211
- {
212
- "category": "instantiate",
213
- "test": "CallbackManager",
214
- "status": "PASS"
215
- },
216
- {
217
- "category": "instantiate",
218
- "test": "Agent",
219
- "status": "PASS"
220
- },
221
- {
222
- "category": "instantiate",
223
- "test": "KnowledgeStore",
224
- "status": "PASS"
225
- },
226
- {
227
- "category": "instantiate",
228
- "test": "Graph",
229
- "status": "PASS"
230
- },
231
- {
232
- "category": "instantiate",
233
- "test": "RunMode",
234
- "status": "PASS"
235
- },
236
- {
237
- "category": "instantiate",
238
- "test": "Trace",
239
- "status": "PASS"
240
- },
241
- {
242
- "category": "instantiate",
243
- "test": "MemoryStore",
244
- "status": "PASS"
245
- },
246
- {
247
- "category": "instantiate",
248
- "test": "MemoryCard",
249
- "status": "PASS"
250
- },
251
- {
252
- "category": "instantiate",
253
- "test": "MemoryCI",
254
- "status": "PASS"
255
- },
256
- {
257
- "category": "instantiate",
258
- "test": "MixtureOfHeuristics",
259
- "status": "PASS"
260
- },
261
- {
262
- "category": "instantiate",
263
- "test": "AdversarialHardener",
264
- "status": "PASS"
265
- },
266
- {
267
- "category": "core",
268
- "test": "Full loop completes",
269
- "status": "PASS"
270
- },
271
- {
272
- "category": "core",
273
- "test": "Trajectory has steps",
274
- "status": "PASS"
275
- },
276
- {
277
- "category": "core",
278
- "test": "Final state exists",
279
- "status": "PASS"
280
- },
281
- {
282
- "category": "phi",
283
- "test": "phi_before in [0,10]",
284
- "status": "PASS"
285
- },
286
- {
287
- "category": "phi",
288
- "test": "phi_after in [0,10]",
289
- "status": "PASS"
290
- },
291
- {
292
- "category": "phi",
293
- "test": "confidence in [0,1]",
294
- "status": "PASS"
295
- },
296
- {
297
- "category": "optimizer",
298
- "test": "Produces heuristics",
299
- "status": "PASS"
300
- },
301
- {
302
- "category": "replay",
303
- "test": "Store works",
304
- "status": "PASS"
305
- },
306
- {
307
- "category": "replay",
308
- "test": "Retrieve works",
309
- "status": "PASS"
310
- },
311
- {
312
- "category": "replay",
313
- "test": "Clear works",
314
- "status": "PASS"
315
- },
316
- {
317
- "category": "backend",
318
- "test": "Strip <think> basic",
319
- "status": "PASS"
320
- },
321
- {
322
- "category": "backend",
323
- "test": "Strip <think> multiline",
324
- "status": "PASS"
325
- },
326
- {
327
- "category": "backend",
328
- "test": "Strip unclosed <think>",
329
- "status": "PASS"
330
- },
331
- {
332
- "category": "backend",
333
- "test": "No tags passthrough",
334
- "status": "PASS"
335
- },
336
- {
337
- "category": "routing",
338
- "test": "ollama: prefix",
339
- "status": "PASS"
340
- },
341
- {
342
- "category": "routing",
343
- "test": "auto-detect ollama model",
344
- "status": "PASS"
345
- },
346
- {
347
- "category": "tools",
348
- "test": "Calculator safe: 2+3*4=14",
349
- "status": "PASS"
350
- },
351
- {
352
- "category": "tools",
353
- "test": "Calculator safe: sqrt(16)=4.0",
354
- "status": "PASS"
355
- },
356
- {
357
- "category": "tools",
358
- "test": "Calculator blocks __import__",
359
- "status": "PASS"
360
- },
361
- {
362
- "category": "tools",
363
- "test": "ReadFile blocks /etc/passwd",
364
- "status": "PASS"
365
- },
366
- {
367
- "category": "tools",
368
- "test": "WriteFile blocks /tmp/evil",
369
- "status": "PASS"
370
- },
371
- {
372
- "category": "runmode",
373
- "test": "TRAIN allows write",
374
- "status": "PASS"
375
- },
376
- {
377
- "category": "runmode",
378
- "test": "EVAL blocks write",
379
- "status": "PASS"
380
- },
381
- {
382
- "category": "runmode",
383
- "test": "EVAL is_eval",
384
- "status": "PASS"
385
- },
386
- {
387
- "category": "trace",
388
- "test": "Events recorded",
389
- "status": "PASS"
390
- },
391
- {
392
- "category": "trace",
393
- "test": "JSONL roundtrip",
394
- "status": "PASS"
395
- },
396
- {
397
- "category": "memory",
398
- "test": "7 MemoryKinds",
399
- "status": "PASS"
400
- },
401
- {
402
- "category": "memory",
403
- "test": "5 MemoryStatuses",
404
- "status": "PASS"
405
- },
406
- {
407
- "category": "memory",
408
- "test": "Scoped retrieve",
409
- "status": "PASS"
410
- },
411
- {
412
- "category": "compiler",
413
- "test": "Respects token budget",
414
- "status": "PASS"
415
- },
416
- {
417
- "category": "compiler",
418
- "test": "Returns memory IDs",
419
- "status": "PASS"
420
- },
421
- {
422
- "category": "immune",
423
- "test": "Safe passes",
424
- "status": "PASS"
425
- },
426
- {
427
- "category": "immune",
428
- "test": "Injection blocked",
429
- "status": "PASS"
430
- },
431
- {
432
- "category": "immune",
433
- "test": "Score hack blocked",
434
- "status": "PASS"
435
- },
436
- {
437
- "category": "immune",
438
- "test": "API key blocked",
439
- "status": "PASS"
440
- },
441
- {
442
- "category": "immune",
443
- "test": "Tool misuse blocked",
444
- "status": "PASS"
445
- },
446
- {
447
- "category": "ci",
448
- "test": "Good \u2192 quarantined",
449
- "status": "PASS"
450
- },
451
- {
452
- "category": "ci",
453
- "test": "Promote works",
454
- "status": "PASS"
455
- },
456
- {
457
- "category": "ci",
458
- "test": "Injection \u2192 rejected",
459
- "status": "PASS"
460
- },
461
- {
462
- "category": "agent",
463
- "test": "Agent.run() completes",
464
- "status": "PASS"
465
- },
466
- {
467
- "category": "graph",
468
- "test": "Conditional routing",
469
- "status": "PASS"
470
- },
471
- {
472
- "category": "parallel",
473
- "test": "3 tasks complete",
474
- "status": "PASS"
475
- },
476
- {
477
- "category": "conversation",
478
- "test": "Messages produced",
479
- "status": "PASS"
480
- },
481
- {
482
- "category": "knowledge",
483
- "test": "Chunks stored",
484
- "status": "PASS"
485
- },
486
- {
487
- "category": "knowledge",
488
- "test": "Query returns results",
489
- "status": "PASS"
490
- },
491
- {
492
- "category": "knowledge",
493
- "test": "as_tool() works",
494
- "status": "PASS"
495
- },
496
- {
497
- "category": "easy",
498
- "test": "purpose() auto-detects coding team",
499
- "status": "PASS"
500
- },
501
- {
502
- "category": "easy",
503
- "test": "purpose() auto-detects research team",
504
- "status": "PASS"
505
- },
506
- {
507
- "category": "easy",
508
- "test": "Team.build() works",
509
- "status": "PASS"
510
- },
511
- {
512
- "category": "research",
513
- "test": "MetaRewardingLoop importable",
514
- "status": "PASS"
515
- },
516
- {
517
- "category": "research",
518
- "test": "SelfTaughtEvaluator importable",
519
- "status": "PASS"
520
- },
521
- {
522
- "category": "research",
523
- "test": "PromptOptimizer importable",
524
- "status": "PASS"
525
- },
526
- {
527
- "category": "research",
528
- "test": "LLMCompiler importable",
529
- "status": "PASS"
530
- },
531
- {
532
- "category": "research",
533
- "test": "Retroformer importable",
534
- "status": "PASS"
535
- },
536
- {
537
- "category": "research",
538
- "test": "PromptOptimizer.compile_prompt works",
539
- "status": "PASS"
540
- },
541
- {
542
- "category": "research",
543
- "test": "LLMCompiler plans tasks",
544
- "status": "PASS"
545
- },
546
- {
547
- "category": "research",
548
- "test": "LLMCompiler executes plan",
549
- "status": "PASS"
550
- },
551
- {
552
- "category": "B2-MoH",
553
- "test": "Shared identified",
554
- "status": "PASS"
555
- },
556
- {
557
- "category": "B2-MoH",
558
- "test": "Total K=5 selected",
559
- "status": "PASS"
560
- },
561
- {
562
- "category": "B6-adversarial",
563
- "test": "Catch rate 95%",
564
- "status": "PASS"
565
- },
566
- {
567
- "category": "B6-adversarial",
568
- "test": "FP rate 0%",
569
- "status": "PASS"
570
- },
571
- {
572
- "category": "parser",
573
- "test": "TOML actor parse",
574
- "status": "PASS"
575
- },
576
- {
577
- "category": "parser",
578
- "test": "JSON actor parse",
579
- "status": "PASS"
580
- },
581
- {
582
- "category": "parser",
583
- "test": "TOML critic parse",
584
- "status": "PASS"
585
- },
586
- {
587
- "category": "parser",
588
- "test": "Extract code from markdown",
589
- "status": "PASS"
590
- },
591
- {
592
- "category": "benchmark",
593
- "test": "Improvement curve: [1.0, 10.0, 10.0]",
594
- "status": "PASS"
595
- },
596
- {
597
- "category": "benchmark",
598
- "test": "Heuristics learned: 6",
599
- "status": "PASS"
600
- }
601
- ]
602
- }