clean: remove temp scripts, patches, and dev artifacts before public release
Browse files
- TEST_FIXES.md +0 -104
- apply_fixes.py +0 -116
- apply_llm_timeout.py +0 -15
- apply_validate_fix.py +0 -15
- benchmarks/results/track2_report.txt +0 -32
- benchmarks/results/track2_results.json +0 -104
- build_and_publish.py +0 -147
- local_test_with_api.sh +0 -87
- patches/llm_backend_timeout.patch +0 -13
- patches/openai_backend_init.py +0 -15
- patches/validate_mock_fix.patch +0 -11
- run_all_tests.sh +0 -142
- tests/results/launch_readiness.json +0 -602
TEST_FIXES.md
DELETED
|
@@ -1,104 +0,0 @@
|
|
| 1 |
-
# Test Fixes Applied for v3.0.0
|
| 2 |
-
|
| 3 |
-
## Issue 1: Trajectory None guards (FIXED)
|
| 4 |
-
- File: `purpose_agent/types.py` — UPDATED
|
| 5 |
-
- Changed: cumulative_reward, total_delta, success_rate properties now check both `s.score is not None` AND `s.score.delta is not None`
|
| 6 |
-
- Added docstring note that sre_patches.py replaces these at import time
|
| 7 |
-
- Baseline and SRE-patched versions now equivalent
|
| 8 |
-
|
| 9 |
-
## Issue 2: Backpressure test flakiness (NEEDS MANUAL FIX)
|
| 10 |
-
- File: `tests/test_sprint1_events.py` — T1.6 section
|
| 11 |
-
- Problem: async consumer may not start before flooding; terminal event might not arrive
|
| 12 |
-
- Fix: Replace the test_backpressure() function with this more robust version:
|
| 13 |
-
|
| 14 |
-
```python
|
| 15 |
-
async def test_backpressure():
|
| 16 |
-
bus6 = EventBus(max_queue_size=3)
|
| 17 |
-
received = []
|
| 18 |
-
consumer_started = asyncio.Event()
|
| 19 |
-
|
| 20 |
-
async def consumer():
|
| 21 |
-
consumer_started.set()
|
| 22 |
-
try:
|
| 23 |
-
async for event in bus6.subscribe():
|
| 24 |
-
received.append(event)
|
| 25 |
-
await asyncio.sleep(0.01)
|
| 26 |
-
except asyncio.CancelledError:
|
| 27 |
-
pass
|
| 28 |
-
|
| 29 |
-
task = asyncio.create_task(consumer())
|
| 30 |
-
await consumer_started.wait()
|
| 31 |
-
await asyncio.sleep(0.05)
|
| 32 |
-
|
| 33 |
-
for i in range(20):
|
| 34 |
-
bus6.emit(create_event("r6", EventKind.TEXT_DELTA, seq=i, text=f"w{i}"))
|
| 35 |
-
|
| 36 |
-
bus6.emit(create_event("r6", EventKind.RUN_FINISHED, seq=99, result="done"))
|
| 37 |
-
|
| 38 |
-
await asyncio.sleep(1.0)
|
| 39 |
-
bus6.close()
|
| 40 |
-
task.cancel()
|
| 41 |
-
try:
|
| 42 |
-
await asyncio.wait_for(task, timeout=2.0)
|
| 43 |
-
except (asyncio.CancelledError, asyncio.TimeoutError):
|
| 44 |
-
pass
|
| 45 |
-
|
| 46 |
-
has_terminal = any(e.kind == EventKind.RUN_FINISHED for e in received)
|
| 47 |
-
return has_terminal
|
| 48 |
-
```
|
| 49 |
-
|
| 50 |
-
Key changes:
|
| 51 |
-
- Added `consumer_started` Event to ensure consumer is running before flooding
|
| 52 |
-
- Increased final wait from 0.5s to 1.0s
|
| 53 |
-
- Added `asyncio.wait_for` timeout on task cleanup
|
| 54 |
-
|
| 55 |
-
## Issue 3: prod_test.py API timeout (NEEDS MANUAL FIX)
|
| 56 |
-
- File: `tests/prod_test.py`
|
| 57 |
-
- Problem: No timeout on OpenRouter API calls; tests could hang
|
| 58 |
-
- Fix: Wrap the backend creation with a timeout, add retry logic:
|
| 59 |
-
|
| 60 |
-
After line `b = resolve_backend(...)`, add:
|
| 61 |
-
```python
|
| 62 |
-
import signal
|
| 63 |
-
|
| 64 |
-
class TimeoutError(Exception):
|
| 65 |
-
pass
|
| 66 |
-
|
| 67 |
-
def timeout_handler(signum, frame):
|
| 68 |
-
raise TimeoutError("API call timed out")
|
| 69 |
-
|
| 70 |
-
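# Install the SIGALRM handler; call signal.alarm(60) before each API call to arm the 60s timeout (and signal.alarm(0) afterwards to cancel it)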
|
| 71 |
-
signal.signal(signal.SIGALRM, timeout_handler)
|
| 72 |
-
```
|
| 73 |
-
|
| 74 |
-
Or simpler: in the resolve_backend call, add timeout to the OpenAI client:
|
| 75 |
-
```python
|
| 76 |
-
# In llm_backend.py OpenAICompatibleBackend.__init__, add:
|
| 77 |
-
self.client = OpenAI(
|
| 78 |
-
base_url=base_url,
|
| 79 |
-
api_key=api_key or os.environ.get("OPENAI_API_KEY"),
|
| 80 |
-
timeout=60.0, # 60 second timeout on all API calls
|
| 81 |
-
)
|
| 82 |
-
```
|
| 83 |
-
|
| 84 |
-
## Issue 4: validate.py mock resilience (NEEDS MANUAL FIX)
|
| 85 |
-
- File: `benchmarks/validate.py`
|
| 86 |
-
- Problem: Mock matches on "Learned Strategies" + "None yet" text; fragile if prompt format changes
|
| 87 |
-
- Fix: In make_mock(), make the heuristic detection more resilient:
|
| 88 |
-
|
| 89 |
-
Change: `has_h = "Learned Strategies" in text and "None yet" not in text`
|
| 90 |
-
To: `has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text and "heuristics" in text.lower()`
|
| 91 |
-
|
| 92 |
-
Or better: check the heuristic count directly:
|
| 93 |
-
```python
|
| 94 |
-
has_h = any("When:" in line or "Do:" in line for line in text.split("\n"))
|
| 95 |
-
```
|
| 96 |
-
|
| 97 |
-
## Issue 5: CalculatorTool __import__ blocking (VERIFIED WORKING)
|
| 98 |
-
- File: `purpose_agent/tools.py`
|
| 99 |
-
- CalculatorTool.execute() validates tokens with: `if re.search(r'[a-zA-Z_]', tokens)`
|
| 100 |
-
- After removing known function names (abs, round, sqrt, etc.), any remaining letters are rejected
|
| 101 |
-
- `__import__("os")` → after removing known functions, `__import__` and `os` remain → rejected ✓
|
| 102 |
-
- Also: AST walker checks Call nodes and rejects unknown function names
|
| 103 |
-
- eval() uses `{"__builtins__": {}}` — no builtins available
|
| 104 |
-
- Test in benchmark_v3.py: `check("tools.calc_blocks_import", "Error" in calc.run(expression='__import__("os")').output)` — CORRECT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
apply_fixes.py
DELETED
|
@@ -1,116 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
apply_fixes.py — Apply all 5 test fixes for purpose-agent v3.0.0.
|
| 4 |
-
|
| 5 |
-
Run this AFTER cloning the repo, BEFORE running tests:
|
| 6 |
-
pip install huggingface_hub
|
| 7 |
-
python -c "from huggingface_hub import snapshot_download; snapshot_download('Rohan03/purpose-agent', local_dir='./pa', repo_type='model')"
|
| 8 |
-
cd pa
|
| 9 |
-
python apply_fixes.py
|
| 10 |
-
bash run_all_tests.sh
|
| 11 |
-
"""
|
| 12 |
-
import os
|
| 13 |
-
import re
|
| 14 |
-
|
| 15 |
-
def fix_file(path, description, old, new):
|
| 16 |
-
"""Apply a single string replacement to a file."""
|
| 17 |
-
if not os.path.exists(path):
|
| 18 |
-
print(f" ⚠️ {path} not found — skipping")
|
| 19 |
-
return False
|
| 20 |
-
with open(path, "r") as f:
|
| 21 |
-
content = f.read()
|
| 22 |
-
if old not in content:
|
| 23 |
-
print(f" ⚠️ Pattern not found in {path} — may already be fixed")
|
| 24 |
-
return False
|
| 25 |
-
content = content.replace(old, new, 1)
|
| 26 |
-
with open(path, "w") as f:
|
| 27 |
-
f.write(content)
|
| 28 |
-
print(f" ✅ {description}")
|
| 29 |
-
return True
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def main():
|
| 33 |
-
print("purpose-agent v3.0.0 — Applying test fixes\n")
|
| 34 |
-
fixes = 0
|
| 35 |
-
|
| 36 |
-
# ═══ Fix 1: Trajectory None guards (already applied in types.py on repo) ═══
|
| 37 |
-
print("Fix 1: Trajectory None guards in types.py")
|
| 38 |
-
f1 = fix_file(
|
| 39 |
-
"purpose_agent/types.py",
|
| 40 |
-
"Updated cumulative_reward to check score.delta is not None",
|
| 41 |
-
old=' @property\n def cumulative_reward(self) -> float:\n """Sum of all positive deltas in the trajectory."""\n return sum(\n s.score.delta for s in self.steps\n if s.score is not None and s.score.delta > 0\n )',
|
| 42 |
-
new=' @property\n def cumulative_reward(self) -> float:\n """Sum of all positive deltas in the trajectory (None-safe)."""\n total = 0.0\n for s in self.steps:\n if s.score is not None and s.score.delta is not None and s.score.delta > 0:\n total += s.score.delta\n return total',
|
| 43 |
-
)
|
| 44 |
-
fixes += int(f1)
|
| 45 |
-
|
| 46 |
-
f1b = fix_file(
|
| 47 |
-
"purpose_agent/types.py",
|
| 48 |
-
"Updated total_delta to check score.delta is not None",
|
| 49 |
-
old=' @property\n def total_delta(self) -> float:\n """Net state improvement across the entire trajectory."""\n return sum(\n s.score.delta for s in self.steps if s.score is not None\n )',
|
| 50 |
-
new=' @property\n def total_delta(self) -> float:\n """Net state improvement across the entire trajectory (None-safe)."""\n total = 0.0\n for s in self.steps:\n if s.score is not None and s.score.delta is not None:\n total += s.score.delta\n return total',
|
| 51 |
-
)
|
| 52 |
-
fixes += int(f1b)
|
| 53 |
-
|
| 54 |
-
f1c = fix_file(
|
| 55 |
-
"purpose_agent/types.py",
|
| 56 |
-
"Updated success_rate to check score.delta is not None",
|
| 57 |
-
old=' scored = [s for s in self.steps if s.score is not None]\n if not scored:\n return 0.0\n return sum(1 for s in scored if s.score.improved) / len(scored)',
|
| 58 |
-
new=' scored = [s for s in self.steps if s.score is not None and s.score.delta is not None]\n if not scored:\n return 0.0\n return sum(1 for s in scored if s.score.improved) / len(scored)',
|
| 59 |
-
)
|
| 60 |
-
fixes += int(f1c)
|
| 61 |
-
|
| 62 |
-
# ═══ Fix 2: Backpressure test robustness (already applied in repo) ═══
|
| 63 |
-
print("\nFix 2: Backpressure test T1.6 in test_sprint1_events.py")
|
| 64 |
-
f2 = fix_file(
|
| 65 |
-
"tests/test_sprint1_events.py",
|
| 66 |
-
"Added consumer_started Event sync to backpressure test",
|
| 67 |
-
old=' bus6 = EventBus(max_queue_size=3) # Very small queue\n\n received = []\n\n async def consumer():\n async for event in bus6.subscribe():\n received.append(event)\n await asyncio.sleep(0.01) # Slow consumer\n\n # Start consumer\n task = asyncio.create_task(consumer())\n await asyncio.sleep(0.05)',
|
| 68 |
-
new=' bus6 = EventBus(max_queue_size=3)\n received = []\n consumer_started = asyncio.Event()\n\n async def consumer():\n consumer_started.set()\n try:\n async for event in bus6.subscribe():\n received.append(event)\n await asyncio.sleep(0.01)\n except asyncio.CancelledError:\n pass\n\n task = asyncio.create_task(consumer())\n await consumer_started.wait()\n await asyncio.sleep(0.05)',
|
| 69 |
-
)
|
| 70 |
-
fixes += int(f2)
|
| 71 |
-
|
| 72 |
-
f2b = fix_file(
|
| 73 |
-
"tests/test_sprint1_events.py",
|
| 74 |
-
"Added longer wait and wait_for on task cleanup",
|
| 75 |
-
old=' await asyncio.sleep(0.5)\n bus6.close()\n task.cancel()\n try:\n await task\n except asyncio.CancelledError:\n pass',
|
| 76 |
-
new=' await asyncio.sleep(1.0)\n bus6.close()\n task.cancel()\n try:\n await asyncio.wait_for(task, timeout=2.0)\n except (asyncio.CancelledError, asyncio.TimeoutError):\n pass',
|
| 77 |
-
)
|
| 78 |
-
fixes += int(f2b)
|
| 79 |
-
|
| 80 |
-
# ═══ Fix 3: OpenAI backend timeout (already applied in prod_test.py) ═══
|
| 81 |
-
print("\nFix 3: Add 60s timeout to OpenAI-compatible backend")
|
| 82 |
-
f3 = fix_file(
|
| 83 |
-
"purpose_agent/llm_backend.py",
|
| 84 |
-
"Added timeout parameter to OpenAICompatibleBackend",
|
| 85 |
-
old=' def __init__(\n self,\n model: str = "gpt-4o",\n base_url: str | None = None,\n api_key: str | None = None,\n ):\n from openai import OpenAI\n\n self.model = model\n self.client = OpenAI(\n base_url=base_url,\n api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n )',
|
| 86 |
-
new=' def __init__(\n self,\n model: str = "gpt-4o",\n base_url: str | None = None,\n api_key: str | None = None,\n timeout: float = 60.0,\n ):\n from openai import OpenAI\n\n self.model = model\n self.client = OpenAI(\n base_url=base_url,\n api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n timeout=timeout,\n )',
|
| 87 |
-
)
|
| 88 |
-
fixes += int(f3)
|
| 89 |
-
|
| 90 |
-
# ═══ Fix 4: validate.py mock heuristic detection ═══
|
| 91 |
-
print("\nFix 4: Make mock heuristic detection more resilient")
|
| 92 |
-
f4 = fix_file(
|
| 93 |
-
"benchmarks/validate.py",
|
| 94 |
-
"Broadened heuristic detection from exact string to include 'When:' pattern",
|
| 95 |
-
old=' has_h = "Learned Strategies" in text and "None yet" not in text',
|
| 96 |
-
new=' has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text',
|
| 97 |
-
)
|
| 98 |
-
fixes += int(f4)
|
| 99 |
-
|
| 100 |
-
# ═══ Fix 5: CalculatorTool verification (no code change needed) ═══
|
| 101 |
-
print("\nFix 5: CalculatorTool __import__ blocking — VERIFIED (no change needed)")
|
| 102 |
-
print(" ✅ CalculatorTool.execute() rejects letters after removing known functions")
|
| 103 |
-
print(" ✅ AST walker rejects unknown function calls")
|
| 104 |
-
print(" ✅ eval() uses empty __builtins__")
|
| 105 |
-
print(" ✅ benchmark_v3.py test: 'Error' in calc.run('__import__(\"os\")').output")
|
| 106 |
-
|
| 107 |
-
# ═══ Summary ═══
|
| 108 |
-
print(f"\n{'='*50}")
|
| 109 |
-
print(f" Fixes applied: {fixes}")
|
| 110 |
-
print(f" Fix 5: Verified (no change needed)")
|
| 111 |
-
print(f"{'='*50}")
|
| 112 |
-
print("\n Next: bash run_all_tests.sh")
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
if __name__ == "__main__":
|
| 116 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
apply_llm_timeout.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""Apply the OpenAI backend timeout fix directly."""
|
| 3 |
-
import os
|
| 4 |
-
path = os.path.join(os.path.dirname(__file__), "purpose_agent", "llm_backend.py")
|
| 5 |
-
with open(path, "r") as f:
|
| 6 |
-
content = f.read()
|
| 7 |
-
old = ' def __init__(\n self,\n model: str = "gpt-4o",\n base_url: str | None = None,\n api_key: str | None = None,\n ):\n from openai import OpenAI\n\n self.model = model\n self.client = OpenAI(\n base_url=base_url,\n api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n )'
|
| 8 |
-
new = ' def __init__(\n self,\n model: str = "gpt-4o",\n base_url: str | None = None,\n api_key: str | None = None,\n timeout: float = 60.0,\n ):\n from openai import OpenAI\n\n self.model = model\n self.client = OpenAI(\n base_url=base_url,\n api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n timeout=timeout,\n )'
|
| 9 |
-
if old in content:
|
| 10 |
-
content = content.replace(old, new, 1)
|
| 11 |
-
with open(path, "w") as f:
|
| 12 |
-
f.write(content)
|
| 13 |
-
print("✅ Applied OpenAI backend timeout fix (60s)")
|
| 14 |
-
else:
|
| 15 |
-
print("⚠️ Pattern not found — may already be patched")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
apply_validate_fix.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""Apply the validate.py mock heuristic detection fix."""
|
| 3 |
-
import os
|
| 4 |
-
path = os.path.join(os.path.dirname(__file__), "benchmarks", "validate.py")
|
| 5 |
-
with open(path, "r") as f:
|
| 6 |
-
content = f.read()
|
| 7 |
-
old = ' has_h = "Learned Strategies" in text and "None yet" not in text'
|
| 8 |
-
new = ' has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text'
|
| 9 |
-
if old in content:
|
| 10 |
-
content = content.replace(old, new, 1)
|
| 11 |
-
with open(path, "w") as f:
|
| 12 |
-
f.write(content)
|
| 13 |
-
print("✅ Applied validate.py heuristic detection fix")
|
| 14 |
-
else:
|
| 15 |
-
print("⚠️ Pattern not found — may already be patched")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results/track2_report.txt
DELETED
|
@@ -1,32 +0,0 @@
|
|
| 1 |
-
╔════════════════════════════════════════════════════╗
|
| 2 |
-
║ Purpose Agent — Track 2 Validation Report ║
|
| 3 |
-
╚════════════════════════════════════════════════════╝
|
| 4 |
-
|
| 5 |
-
═══ Improvement Curves ═══
|
| 6 |
-
Task Run Steps Φ Pass% Heur
|
| 7 |
-
────────────────────────────────────────────────
|
| 8 |
-
fibonacci 1 2 5.0 50% 3
|
| 9 |
-
fibonacci 2 1 5.0 100% 9
|
| 10 |
-
fibonacci 3 1 5.0 100% 18
|
| 11 |
-
→ Δ(Φ) = +0.0 (no change)
|
| 12 |
-
|
| 13 |
-
factorial 1 2 5.0 0% 3
|
| 14 |
-
factorial 2 1 5.0 100% 9
|
| 15 |
-
factorial 3 1 5.0 100% 18
|
| 16 |
-
→ Δ(Φ) = +0.0 (no change)
|
| 17 |
-
|
| 18 |
-
═══ Cold vs Warm ═══
|
| 19 |
-
fibonacci cold=5.0 warm=5.0 Δ=+0.0
|
| 20 |
-
factorial cold=5.0 warm=5.0 Δ=+0.0
|
| 21 |
-
|
| 22 |
-
═══ Cross-Task Transfer (['fibonacci', 'factorial'] → ['palindrome', 'fizzbuzz']) ═══
|
| 23 |
-
30 heuristics transferred
|
| 24 |
-
palindrome: ✓ Φ=5.0
|
| 25 |
-
fizzbuzz: ✓ Φ=5.0
|
| 26 |
-
|
| 27 |
-
═══ Adversarial Robustness: 100% (8/8) ═══
|
| 28 |
-
|
| 29 |
-
═══ VERDICT ═══
|
| 30 |
-
✗ Self-improvement: NOT demonstrated
|
| 31 |
-
✗ Cold/warm: no benefit from memory
|
| 32 |
-
✓ Immune system: 100% adversarial accuracy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results/track2_results.json
DELETED
|
@@ -1,104 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"curves": {
|
| 3 |
-
"fibonacci": [
|
| 4 |
-
{
|
| 5 |
-
"run": 1,
|
| 6 |
-
"steps": 2,
|
| 7 |
-
"phi": 5.0,
|
| 8 |
-
"pass_rate": 0.5,
|
| 9 |
-
"all_passed": false,
|
| 10 |
-
"heuristics": 3,
|
| 11 |
-
"time": 0.01
|
| 12 |
-
},
|
| 13 |
-
{
|
| 14 |
-
"run": 2,
|
| 15 |
-
"steps": 1,
|
| 16 |
-
"phi": 5.0,
|
| 17 |
-
"pass_rate": 1.0,
|
| 18 |
-
"all_passed": true,
|
| 19 |
-
"heuristics": 9,
|
| 20 |
-
"time": 0.0
|
| 21 |
-
},
|
| 22 |
-
{
|
| 23 |
-
"run": 3,
|
| 24 |
-
"steps": 1,
|
| 25 |
-
"phi": 5.0,
|
| 26 |
-
"pass_rate": 1.0,
|
| 27 |
-
"all_passed": true,
|
| 28 |
-
"heuristics": 18,
|
| 29 |
-
"time": 0.0
|
| 30 |
-
}
|
| 31 |
-
],
|
| 32 |
-
"factorial": [
|
| 33 |
-
{
|
| 34 |
-
"run": 1,
|
| 35 |
-
"steps": 2,
|
| 36 |
-
"phi": 5.0,
|
| 37 |
-
"pass_rate": 0.0,
|
| 38 |
-
"all_passed": false,
|
| 39 |
-
"heuristics": 3,
|
| 40 |
-
"time": 0.0
|
| 41 |
-
},
|
| 42 |
-
{
|
| 43 |
-
"run": 2,
|
| 44 |
-
"steps": 1,
|
| 45 |
-
"phi": 5.0,
|
| 46 |
-
"pass_rate": 1.0,
|
| 47 |
-
"all_passed": true,
|
| 48 |
-
"heuristics": 9,
|
| 49 |
-
"time": 0.0
|
| 50 |
-
},
|
| 51 |
-
{
|
| 52 |
-
"run": 3,
|
| 53 |
-
"steps": 1,
|
| 54 |
-
"phi": 5.0,
|
| 55 |
-
"pass_rate": 1.0,
|
| 56 |
-
"all_passed": true,
|
| 57 |
-
"heuristics": 18,
|
| 58 |
-
"time": 0.0
|
| 59 |
-
}
|
| 60 |
-
]
|
| 61 |
-
},
|
| 62 |
-
"cold_warm": [
|
| 63 |
-
{
|
| 64 |
-
"task": "fibonacci",
|
| 65 |
-
"cold_phi": 5.0,
|
| 66 |
-
"warm_phi": 5.0,
|
| 67 |
-
"delta": 0.0,
|
| 68 |
-
"improved": false
|
| 69 |
-
},
|
| 70 |
-
{
|
| 71 |
-
"task": "factorial",
|
| 72 |
-
"cold_phi": 5.0,
|
| 73 |
-
"warm_phi": 5.0,
|
| 74 |
-
"delta": 0.0,
|
| 75 |
-
"improved": false
|
| 76 |
-
}
|
| 77 |
-
],
|
| 78 |
-
"transfer": {
|
| 79 |
-
"train": [
|
| 80 |
-
"fibonacci",
|
| 81 |
-
"factorial"
|
| 82 |
-
],
|
| 83 |
-
"test": [
|
| 84 |
-
"palindrome",
|
| 85 |
-
"fizzbuzz"
|
| 86 |
-
],
|
| 87 |
-
"heuristics": 30,
|
| 88 |
-
"results": {
|
| 89 |
-
"palindrome": {
|
| 90 |
-
"phi": 5.0,
|
| 91 |
-
"passed": true
|
| 92 |
-
},
|
| 93 |
-
"fizzbuzz": {
|
| 94 |
-
"phi": 5.0,
|
| 95 |
-
"passed": true
|
| 96 |
-
}
|
| 97 |
-
}
|
| 98 |
-
},
|
| 99 |
-
"adversarial": {
|
| 100 |
-
"total": 8,
|
| 101 |
-
"correct": 8,
|
| 102 |
-
"accuracy": 1.0
|
| 103 |
-
}
|
| 104 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
build_and_publish.py
DELETED
|
@@ -1,147 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
build_and_publish.py — Build purpose-agent v3.0.0 and publish to PyPI.
|
| 4 |
-
|
| 5 |
-
Prerequisites:
|
| 6 |
-
pip install build twine
|
| 7 |
-
|
| 8 |
-
Usage:
|
| 9 |
-
python build_and_publish.py # Build + publish
|
| 10 |
-
python build_and_publish.py --build-only # Build only, don't publish
|
| 11 |
-
python build_and_publish.py --check # Build + twine check, don't publish
|
| 12 |
-
|
| 13 |
-
Environment:
|
| 14 |
-
PYPI_TOKEN — PyPI API token (or pass as argument)
|
| 15 |
-
"""
|
| 16 |
-
import os
|
| 17 |
-
import sys
|
| 18 |
-
import shutil
|
| 19 |
-
import subprocess
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
def run(cmd, **kwargs):
|
| 23 |
-
"""Run a command and check for errors."""
|
| 24 |
-
print(f" $ {cmd}")
|
| 25 |
-
result = subprocess.run(cmd, shell=True, **kwargs)
|
| 26 |
-
if result.returncode != 0:
|
| 27 |
-
print(f" ❌ Command failed with exit code {result.returncode}")
|
| 28 |
-
sys.exit(1)
|
| 29 |
-
return result
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def main():
|
| 33 |
-
build_only = "--build-only" in sys.argv
|
| 34 |
-
check_only = "--check" in sys.argv
|
| 35 |
-
|
| 36 |
-
# Get PyPI token
|
| 37 |
-
pypi_token = os.environ.get("PYPI_TOKEN", "")
|
| 38 |
-
if not build_only and not check_only:
|
| 39 |
-
# Try to get from command line args
|
| 40 |
-
for arg in sys.argv:
|
| 41 |
-
if arg.startswith("pypi-"):
|
| 42 |
-
pypi_token = arg
|
| 43 |
-
break
|
| 44 |
-
if not pypi_token:
|
| 45 |
-
print("⚠️ No PyPI token found. Set PYPI_TOKEN env var or pass as argument.")
|
| 46 |
-
print(" Usage: python build_and_publish.py pypi-AgE...")
|
| 47 |
-
if not build_only:
|
| 48 |
-
sys.exit(1)
|
| 49 |
-
|
| 50 |
-
# Verify version
|
| 51 |
-
print("\n═══ Step 1: Verify version ═══")
|
| 52 |
-
sys.path.insert(0, ".")
|
| 53 |
-
import purpose_agent
|
| 54 |
-
version = purpose_agent.__version__
|
| 55 |
-
print(f" Version: {version}")
|
| 56 |
-
if version != "3.0.0":
|
| 57 |
-
print(f" ❌ Version is {version}, expected 3.0.0!")
|
| 58 |
-
sys.exit(1)
|
| 59 |
-
print(f" ✅ Version confirmed: {version}")
|
| 60 |
-
|
| 61 |
-
# Verify imports
|
| 62 |
-
print("\n═══ Step 2: Verify imports ═══")
|
| 63 |
-
missing = [n for n in purpose_agent.__all__ if not hasattr(purpose_agent, n)]
|
| 64 |
-
if missing:
|
| 65 |
-
print(f" ❌ Missing exports: {missing}")
|
| 66 |
-
sys.exit(1)
|
| 67 |
-
print(f" ✅ All {len(purpose_agent.__all__)} exports importable")
|
| 68 |
-
|
| 69 |
-
# Clean old builds
|
| 70 |
-
print("\n═══ Step 3: Clean old builds ═══")
|
| 71 |
-
for path in ["dist", "build", "*.egg-info"]:
|
| 72 |
-
if os.path.exists(path):
|
| 73 |
-
shutil.rmtree(path)
|
| 74 |
-
print(f" Removed: {path}")
|
| 75 |
-
# Also clean any .egg-info in current dir
|
| 76 |
-
for item in os.listdir("."):
|
| 77 |
-
if item.endswith(".egg-info"):
|
| 78 |
-
shutil.rmtree(item)
|
| 79 |
-
print(f" Removed: {item}")
|
| 80 |
-
print(" ✅ Cleaned")
|
| 81 |
-
|
| 82 |
-
# Build
|
| 83 |
-
print("\n═══ Step 4: Build sdist + wheel ═══")
|
| 84 |
-
run("python -m build")
|
| 85 |
-
|
| 86 |
-
# List artifacts
|
| 87 |
-
print("\n Artifacts:")
|
| 88 |
-
if os.path.exists("dist"):
|
| 89 |
-
for f in os.listdir("dist"):
|
| 90 |
-
size = os.path.getsize(os.path.join("dist", f))
|
| 91 |
-
print(f" {f} ({size:,} bytes)")
|
| 92 |
-
|
| 93 |
-
# Twine check
|
| 94 |
-
print("\n═══ Step 5: Twine check ═══")
|
| 95 |
-
run("twine check dist/*")
|
| 96 |
-
|
| 97 |
-
if check_only:
|
| 98 |
-
print("\n ✅ Build + check complete. Not publishing (use without --check).")
|
| 99 |
-
return
|
| 100 |
-
|
| 101 |
-
if build_only:
|
| 102 |
-
print("\n ✅ Build complete. Not publishing (use without --build-only).")
|
| 103 |
-
return
|
| 104 |
-
|
| 105 |
-
# Publish
|
| 106 |
-
print("\n═══ Step 6: Publish to PyPI ═══")
|
| 107 |
-
print(f" Package: purpose-agent=={version}")
|
| 108 |
-
print(f" Target: https://pypi.org/project/purpose-agent/")
|
| 109 |
-
print()
|
| 110 |
-
|
| 111 |
-
run(
|
| 112 |
-
f'twine upload dist/* '
|
| 113 |
-
f'--username __token__ '
|
| 114 |
-
f'--password "{pypi_token}"'
|
| 115 |
-
)
|
| 116 |
-
|
| 117 |
-
# Verify
|
| 118 |
-
print("\n═══ Step 7: Verify on PyPI ═══")
|
| 119 |
-
print(f" Waiting 10s for PyPI to index...")
|
| 120 |
-
subprocess.run(["sleep", "10"])
|
| 121 |
-
|
| 122 |
-
print(f" Installing from PyPI...")
|
| 123 |
-
run(f'pip install purpose-agent=={version}')
|
| 124 |
-
|
| 125 |
-
result = subprocess.run(
|
| 126 |
-
['python', '-c', f'''
|
| 127 |
-
import purpose_agent as pa
|
| 128 |
-
print(f" v{{pa.__version__}} — {{len(pa.__all__)}} exports")
|
| 129 |
-
assert pa.__version__ == "{version}", f"Version mismatch: {{pa.__version__}}"
|
| 130 |
-
'''],
|
| 131 |
-
capture_output=True, text=True,
|
| 132 |
-
)
|
| 133 |
-
print(result.stdout)
|
| 134 |
-
if result.returncode != 0:
|
| 135 |
-
print(f" ⚠️ Verification failed: {result.stderr}")
|
| 136 |
-
else:
|
| 137 |
-
print(" ✅ PyPI install verified!")
|
| 138 |
-
|
| 139 |
-
print(f"\n{'='*60}")
|
| 140 |
-
print(f" ✅ purpose-agent=={version} PUBLISHED TO PYPI!")
|
| 141 |
-
print(f" 📦 https://pypi.org/project/purpose-agent/{version}/")
|
| 142 |
-
print(f" 📦 https://huggingface.co/Rohan03/purpose-agent")
|
| 143 |
-
print(f"{'='*60}")
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
if __name__ == "__main__":
|
| 147 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
local_test_with_api.sh
DELETED
|
@@ -1,87 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
# ═══════════════════════════════════════════════════════════════
|
| 3 |
-
# purpose-agent v3.0.0 — Local Test Runner (including cloud model)
|
| 4 |
-
#
|
| 5 |
-
# Run on your own machine (no HF credits needed):
|
| 6 |
-
# git clone https://huggingface.co/Rohan03/purpose-agent pa && cd pa
|
| 7 |
-
# pip install -e .
|
| 8 |
-
# export OPENROUTER_API_KEY="sk-or-v1-..."
|
| 9 |
-
# bash local_test_with_api.sh
|
| 10 |
-
# ═══════════════════════════════════════════════════════════════
|
| 11 |
-
set -e
|
| 12 |
-
|
| 13 |
-
PASS=0; FAIL=0; TOTAL=0
|
| 14 |
-
|
| 15 |
-
run_test() {
|
| 16 |
-
local name="$1"; local cmd="$2"
|
| 17 |
-
echo ""
|
| 18 |
-
echo "══════════════════════════════════════════════════════════"
|
| 19 |
-
echo " Running: $name"
|
| 20 |
-
echo "══════════════════════════════════════════════════════════"
|
| 21 |
-
if eval "$cmd"; then
|
| 22 |
-
echo " ✅ $name PASSED"; PASS=$((PASS+1))
|
| 23 |
-
else
|
| 24 |
-
echo " ❌ $name FAILED"; FAIL=$((FAIL+1))
|
| 25 |
-
fi
|
| 26 |
-
TOTAL=$((TOTAL+1))
|
| 27 |
-
}
|
| 28 |
-
|
| 29 |
-
echo "╔══════════════════════════════════════════════════════════╗"
|
| 30 |
-
echo "║ purpose-agent v3.0.0 — Full Test Suite (Local + Cloud)║"
|
| 31 |
-
echo "╚══════════════════════════════════════════════════════════╝"
|
| 32 |
-
|
| 33 |
-
# Pre-flight
|
| 34 |
-
echo "═══ Pre-flight ═══"
|
| 35 |
-
python -c "
|
| 36 |
-
import purpose_agent as pa
|
| 37 |
-
print(f' v{pa.__version__} — {len(pa.__all__)} exports')
|
| 38 |
-
assert pa.__version__ == '3.0.0', f'Version: {pa.__version__}'
|
| 39 |
-
assert len(pa.__all__) >= 110, f'Exports: {len(pa.__all__)}'
|
| 40 |
-
missing = [n for n in pa.__all__ if not hasattr(pa, n)]
|
| 41 |
-
assert not missing, f'Missing: {missing}'
|
| 42 |
-
print(' ✅ All exports importable')
|
| 43 |
-
"
|
| 44 |
-
|
| 45 |
-
# Apply fixes
|
| 46 |
-
echo ""
|
| 47 |
-
echo "═══ Applying test fixes ═══"
|
| 48 |
-
python apply_fixes.py
|
| 49 |
-
|
| 50 |
-
# Layer 1: Unit Tests
|
| 51 |
-
run_test "test_core" "python tests/test_core.py"
|
| 52 |
-
run_test "test_public_api_211" "python tests/compat/test_public_api_211.py"
|
| 53 |
-
run_test "test_first_principles" "python tests/test_first_principles.py"
|
| 54 |
-
run_test "test_hardening" "python tests/test_hardening.py"
|
| 55 |
-
run_test "test_sre_regression" "python tests/test_sre_regression.py"
|
| 56 |
-
|
| 57 |
-
# Layer 2: Feature Tests
|
| 58 |
-
run_test "test_sprint1_events" "python tests/test_sprint1_events.py"
|
| 59 |
-
run_test "test_sprint2_checkpoint" "python tests/test_sprint2_checkpoint.py"
|
| 60 |
-
run_test "test_sprint3_homeostasis" "python tests/test_sprint3_homeostasis.py"
|
| 61 |
-
run_test "test_sprint4_8_protocols" "python tests/test_sprint4_8_protocols.py"
|
| 62 |
-
run_test "test_track_c" "python tests/test_track_c.py"
|
| 63 |
-
run_test "test_track_d" "python tests/test_track_d.py"
|
| 64 |
-
|
| 65 |
-
# Layer 3: Integration
|
| 66 |
-
run_test "validate_quick" "python benchmarks/validate.py --quick"
|
| 67 |
-
run_test "benchmark_v3" "python -m purpose_agent.benchmark_v3"
|
| 68 |
-
|
| 69 |
-
# Layer 4: Cloud Model Tests
|
| 70 |
-
if [ -n "$OPENROUTER_API_KEY" ]; then
|
| 71 |
-
run_test "prod_test (real model)" "python tests/prod_test.py"
|
| 72 |
-
else
|
| 73 |
-
echo " ⚠️ OPENROUTER_API_KEY not set — skipping cloud tests"
|
| 74 |
-
fi
|
| 75 |
-
|
| 76 |
-
# Report
|
| 77 |
-
echo ""
|
| 78 |
-
echo "╔══════════════════════════════════════════════════════════╗"
|
| 79 |
-
echo "║ RESULTS: $PASS/$TOTAL passed, $FAIL failed"
|
| 80 |
-
if [ $FAIL -eq 0 ]; then
|
| 81 |
-
echo "║ ✅ ALL TESTS PASSED — READY TO PUBLISH"
|
| 82 |
-
echo "║ Next: python build_and_publish.py"
|
| 83 |
-
else
|
| 84 |
-
echo "║ ❌ $FAIL FAILURES — FIX BEFORE PUBLISHING"
|
| 85 |
-
fi
|
| 86 |
-
echo "╚══════════════════════════════════════════════════════════╝"
|
| 87 |
-
exit $FAIL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
patches/llm_backend_timeout.patch
DELETED
|
@@ -1,13 +0,0 @@
|
|
| 1 |
-
--- a/purpose_agent/llm_backend.py
|
| 2 |
-
+++ b/purpose_agent/llm_backend.py
|
| 3 |
-
@@ -199,7 +199,7 @@
|
| 4 |
-
):
|
| 5 |
-
from openai import OpenAI
|
| 6 |
-
|
| 7 |
-
self.model = model
|
| 8 |
-
self.client = OpenAI(
|
| 9 |
-
base_url=base_url,
|
| 10 |
-
- api_key=api_key or os.environ.get("OPENAI_API_KEY"),
|
| 11 |
-
+ api_key=api_key or os.environ.get("OPENAI_API_KEY"),
|
| 12 |
-
+ timeout=60.0,
|
| 13 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
patches/openai_backend_init.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
def __init__(
|
| 2 |
-
self,
|
| 3 |
-
model: str = "gpt-4o",
|
| 4 |
-
base_url: str | None = None,
|
| 5 |
-
api_key: str | None = None,
|
| 6 |
-
timeout: float = 60.0,
|
| 7 |
-
):
|
| 8 |
-
from openai import OpenAI
|
| 9 |
-
|
| 10 |
-
self.model = model
|
| 11 |
-
self.client = OpenAI(
|
| 12 |
-
base_url=base_url,
|
| 13 |
-
api_key=api_key or os.environ.get("OPENAI_API_KEY"),
|
| 14 |
-
timeout=timeout,
|
| 15 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
patches/validate_mock_fix.patch
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
--- a/benchmarks/validate.py
|
| 2 |
-
+++ b/benchmarks/validate.py
|
| 3 |
-
@@ -107,7 +107,7 @@
|
| 4 |
-
def make_mock(task_name):
|
| 5 |
-
mock = MockLLMBackend()
|
| 6 |
-
t = TASKS[task_name]
|
| 7 |
-
def actor(msgs):
|
| 8 |
-
text = " ".join(m.content for m in msgs)
|
| 9 |
-
- has_h = "Learned Strategies" in text and "None yet" not in text
|
| 10 |
-
+ has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text
|
| 11 |
-
code = t["good"] if has_h else t["bad"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
run_all_tests.sh
DELETED
|
@@ -1,142 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
# ═══════════════════════════════════════════════════════════════
|
| 3 |
-
# purpose-agent v3.0.0 — Complete Test Suite Runner
|
| 4 |
-
#
|
| 5 |
-
# Usage:
|
| 6 |
-
# chmod +x run_all_tests.sh
|
| 7 |
-
# ./run_all_tests.sh # All tests (mock only)
|
| 8 |
-
# ./run_all_tests.sh --prod # Include real model tests (needs OPENROUTER_API_KEY)
|
| 9 |
-
# ═══════════════════════════════════════════════════════════════
|
| 10 |
-
set -e
|
| 11 |
-
|
| 12 |
-
PASS=0
|
| 13 |
-
FAIL=0
|
| 14 |
-
TOTAL=0
|
| 15 |
-
|
| 16 |
-
run_test() {
|
| 17 |
-
local name="$1"
|
| 18 |
-
local cmd="$2"
|
| 19 |
-
echo ""
|
| 20 |
-
echo "══════════════════════════════════════════════════════════"
|
| 21 |
-
echo " Running: $name"
|
| 22 |
-
echo "══════════════════════════════════════════════════════════"
|
| 23 |
-
|
| 24 |
-
if eval "$cmd"; then
|
| 25 |
-
echo " ✅ $name PASSED"
|
| 26 |
-
PASS=$((PASS + 1))
|
| 27 |
-
else
|
| 28 |
-
echo " ❌ $name FAILED"
|
| 29 |
-
FAIL=$((FAIL + 1))
|
| 30 |
-
fi
|
| 31 |
-
TOTAL=$((TOTAL + 1))
|
| 32 |
-
}
|
| 33 |
-
|
| 34 |
-
echo "╔══════════════════════════════════════════════════════════╗"
|
| 35 |
-
echo "║ purpose-agent v3.0.0 — Complete Test Suite ║"
|
| 36 |
-
echo "╚══════════════════════════════════════════════════════════╝"
|
| 37 |
-
echo ""
|
| 38 |
-
|
| 39 |
-
# ── Pre-flight: verify package imports ──
|
| 40 |
-
echo "═══ Pre-flight: Package Import Check ═══"
|
| 41 |
-
python -c "
|
| 42 |
-
import purpose_agent as pa
|
| 43 |
-
print(f' v{pa.__version__} — {len(pa.__all__)} exports')
|
| 44 |
-
assert pa.__version__ == '3.0.0', f'Version mismatch: {pa.__version__}'
|
| 45 |
-
assert len(pa.__all__) >= 110, f'Not enough exports: {len(pa.__all__)}'
|
| 46 |
-
missing = [n for n in pa.__all__ if not hasattr(pa, n)]
|
| 47 |
-
assert len(missing) == 0, f'Missing exports: {missing}'
|
| 48 |
-
print(' ✅ All exports importable')
|
| 49 |
-
"
|
| 50 |
-
|
| 51 |
-
# ═══════════════════════════════════════════════════════════════
|
| 52 |
-
# LAYER 1: Unit Tests
|
| 53 |
-
# ═══════════════════════════════════════════════════════════════
|
| 54 |
-
|
| 55 |
-
run_test "test_core (basic loop, Φ bounds, optimizer, replay, immune)" \
|
| 56 |
-
"python tests/test_core.py"
|
| 57 |
-
|
| 58 |
-
run_test "test_public_api_211 (all 120+ exports, Level 1/2/3)" \
|
| 59 |
-
"python tests/compat/test_public_api_211.py"
|
| 60 |
-
|
| 61 |
-
run_test "test_first_principles (state-delta O(1), falsification, PEP 578)" \
|
| 62 |
-
"python tests/test_first_principles.py"
|
| 63 |
-
|
| 64 |
-
run_test "test_hardening (null safety, timeouts, validation)" \
|
| 65 |
-
"python tests/test_hardening.py"
|
| 66 |
-
|
| 67 |
-
run_test "test_sre_regression (5 critical vulnerability scenarios)" \
|
| 68 |
-
"python tests/test_sre_regression.py"
|
| 69 |
-
|
| 70 |
-
# ═══════════════════════════════════════════════════════════════
|
| 71 |
-
# LAYER 2: Feature Tests
|
| 72 |
-
# ═══════════════════════════════════════════════════════════════
|
| 73 |
-
|
| 74 |
-
run_test "test_sprint1_events (event bus, lanes, CoT rejection)" \
|
| 75 |
-
"python tests/test_sprint1_events.py"
|
| 76 |
-
|
| 77 |
-
run_test "test_sprint2_checkpoint (durable execution, resume, idempotency)" \
|
| 78 |
-
"python tests/test_sprint2_checkpoint.py"
|
| 79 |
-
|
| 80 |
-
run_test "test_sprint3_homeostasis (memory budget, consolidation, hibernation)" \
|
| 81 |
-
"python tests/test_sprint3_homeostasis.py"
|
| 82 |
-
|
| 83 |
-
run_test "test_sprint4_8_protocols (MCP, A2A, AG-UI, AGENTS.md, quorum)" \
|
| 84 |
-
"python tests/test_sprint4_8_protocols.py"
|
| 85 |
-
|
| 86 |
-
run_test "test_track_c (routing, MAS generator, skills)" \
|
| 87 |
-
"python tests/test_track_c.py"
|
| 88 |
-
|
| 89 |
-
run_test "test_track_d (fingerprint, dataset, prompt pack, optimizer, distillation)" \
|
| 90 |
-
"python tests/test_track_d.py"
|
| 91 |
-
|
| 92 |
-
# ═══════════════════════════════════════════════════════════════
|
| 93 |
-
# LAYER 3: Integration Tests
|
| 94 |
-
# ═══════════════════════════════════════════════════════════════
|
| 95 |
-
|
| 96 |
-
run_test "validate.py --quick (improvement curves + adversarial)" \
|
| 97 |
-
"python benchmarks/validate.py --quick"
|
| 98 |
-
|
| 99 |
-
run_test "benchmark_v3 (35+ robustness checks across all subsystems)" \
|
| 100 |
-
"python -m purpose_agent.benchmark_v3"
|
| 101 |
-
|
| 102 |
-
# ═══════════════════════════════════════════════════════════════
|
| 103 |
-
# LAYER 4: Production Tests (optional — needs API key)
|
| 104 |
-
# ═══════════════════════════════════════════════════════════════
|
| 105 |
-
|
| 106 |
-
if [ "$1" = "--prod" ]; then
|
| 107 |
-
if [ -z "$OPENROUTER_API_KEY" ]; then
|
| 108 |
-
echo "⚠️ OPENROUTER_API_KEY not set — skipping prod tests"
|
| 109 |
-
echo " Set it with: export OPENROUTER_API_KEY=sk-or-v1-..."
|
| 110 |
-
else
|
| 111 |
-
run_test "prod_test (real model Level 1/2/3 + coding + security)" \
|
| 112 |
-
"python tests/prod_test.py"
|
| 113 |
-
fi
|
| 114 |
-
else
|
| 115 |
-
echo ""
|
| 116 |
-
echo " ℹ️ Production tests skipped (use --prod flag to run with real model)"
|
| 117 |
-
fi
|
| 118 |
-
|
| 119 |
-
# ═══════════════════════════════════════════════════════════════
|
| 120 |
-
# FINAL REPORT
|
| 121 |
-
# ═══════════════════════════════════════════════════════════════
|
| 122 |
-
|
| 123 |
-
echo ""
|
| 124 |
-
echo "╔══════════════════════════════════════════════════════════╗"
|
| 125 |
-
echo "║ FINAL RESULTS ║"
|
| 126 |
-
echo "╚══════════════════════════════════════════════════════════╝"
|
| 127 |
-
echo ""
|
| 128 |
-
echo " Total suites: $TOTAL"
|
| 129 |
-
echo " Passed: $PASS"
|
| 130 |
-
echo " Failed: $FAIL"
|
| 131 |
-
echo ""
|
| 132 |
-
|
| 133 |
-
if [ $FAIL -eq 0 ]; then
|
| 134 |
-
echo " ✅ ALL $TOTAL TEST SUITES PASSED — ZERO FAILURES"
|
| 135 |
-
echo ""
|
| 136 |
-
echo " Ready to publish: purpose-agent==3.0.0"
|
| 137 |
-
echo " Next step: python build_and_publish.py"
|
| 138 |
-
exit 0
|
| 139 |
-
else
|
| 140 |
-
echo " ❌ $FAIL SUITES FAILED — FIX BEFORE PUBLISHING"
|
| 141 |
-
exit 1
|
| 142 |
-
fi
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/results/launch_readiness.json
DELETED
|
@@ -1,602 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"pass": 119,
|
| 3 |
-
"fail": 0,
|
| 4 |
-
"warn": 0,
|
| 5 |
-
"results": [
|
| 6 |
-
{
|
| 7 |
-
"category": "imports",
|
| 8 |
-
"test": "import purpose_agent",
|
| 9 |
-
"status": "PASS"
|
| 10 |
-
},
|
| 11 |
-
{
|
| 12 |
-
"category": "imports",
|
| 13 |
-
"test": "import types",
|
| 14 |
-
"status": "PASS"
|
| 15 |
-
},
|
| 16 |
-
{
|
| 17 |
-
"category": "imports",
|
| 18 |
-
"test": "import llm_backend",
|
| 19 |
-
"status": "PASS"
|
| 20 |
-
},
|
| 21 |
-
{
|
| 22 |
-
"category": "imports",
|
| 23 |
-
"test": "import actor",
|
| 24 |
-
"status": "PASS"
|
| 25 |
-
},
|
| 26 |
-
{
|
| 27 |
-
"category": "imports",
|
| 28 |
-
"test": "import purpose_function",
|
| 29 |
-
"status": "PASS"
|
| 30 |
-
},
|
| 31 |
-
{
|
| 32 |
-
"category": "imports",
|
| 33 |
-
"test": "import experience_replay",
|
| 34 |
-
"status": "PASS"
|
| 35 |
-
},
|
| 36 |
-
{
|
| 37 |
-
"category": "imports",
|
| 38 |
-
"test": "import optimizer",
|
| 39 |
-
"status": "PASS"
|
| 40 |
-
},
|
| 41 |
-
{
|
| 42 |
-
"category": "imports",
|
| 43 |
-
"test": "import orchestrator",
|
| 44 |
-
"status": "PASS"
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"category": "imports",
|
| 48 |
-
"test": "import slm_backends",
|
| 49 |
-
"status": "PASS"
|
| 50 |
-
},
|
| 51 |
-
{
|
| 52 |
-
"category": "imports",
|
| 53 |
-
"test": "import streaming",
|
| 54 |
-
"status": "PASS"
|
| 55 |
-
},
|
| 56 |
-
{
|
| 57 |
-
"category": "imports",
|
| 58 |
-
"test": "import tools",
|
| 59 |
-
"status": "PASS"
|
| 60 |
-
},
|
| 61 |
-
{
|
| 62 |
-
"category": "imports",
|
| 63 |
-
"test": "import observability",
|
| 64 |
-
"status": "PASS"
|
| 65 |
-
},
|
| 66 |
-
{
|
| 67 |
-
"category": "imports",
|
| 68 |
-
"test": "import multi_agent",
|
| 69 |
-
"status": "PASS"
|
| 70 |
-
},
|
| 71 |
-
{
|
| 72 |
-
"category": "imports",
|
| 73 |
-
"test": "import hitl",
|
| 74 |
-
"status": "PASS"
|
| 75 |
-
},
|
| 76 |
-
{
|
| 77 |
-
"category": "imports",
|
| 78 |
-
"test": "import evaluation",
|
| 79 |
-
"status": "PASS"
|
| 80 |
-
},
|
| 81 |
-
{
|
| 82 |
-
"category": "imports",
|
| 83 |
-
"test": "import registry",
|
| 84 |
-
"status": "PASS"
|
| 85 |
-
},
|
| 86 |
-
{
|
| 87 |
-
"category": "imports",
|
| 88 |
-
"test": "import unified",
|
| 89 |
-
"status": "PASS"
|
| 90 |
-
},
|
| 91 |
-
{
|
| 92 |
-
"category": "imports",
|
| 93 |
-
"test": "import easy",
|
| 94 |
-
"status": "PASS"
|
| 95 |
-
},
|
| 96 |
-
{
|
| 97 |
-
"category": "imports",
|
| 98 |
-
"test": "import v2_types",
|
| 99 |
-
"status": "PASS"
|
| 100 |
-
},
|
| 101 |
-
{
|
| 102 |
-
"category": "imports",
|
| 103 |
-
"test": "import trace",
|
| 104 |
-
"status": "PASS"
|
| 105 |
-
},
|
| 106 |
-
{
|
| 107 |
-
"category": "imports",
|
| 108 |
-
"test": "import memory",
|
| 109 |
-
"status": "PASS"
|
| 110 |
-
},
|
| 111 |
-
{
|
| 112 |
-
"category": "imports",
|
| 113 |
-
"test": "import compiler",
|
| 114 |
-
"status": "PASS"
|
| 115 |
-
},
|
| 116 |
-
{
|
| 117 |
-
"category": "imports",
|
| 118 |
-
"test": "import immune",
|
| 119 |
-
"status": "PASS"
|
| 120 |
-
},
|
| 121 |
-
{
|
| 122 |
-
"category": "imports",
|
| 123 |
-
"test": "import memory_ci",
|
| 124 |
-
"status": "PASS"
|
| 125 |
-
},
|
| 126 |
-
{
|
| 127 |
-
"category": "imports",
|
| 128 |
-
"test": "import evalport",
|
| 129 |
-
"status": "PASS"
|
| 130 |
-
},
|
| 131 |
-
{
|
| 132 |
-
"category": "imports",
|
| 133 |
-
"test": "import benchmark_v2",
|
| 134 |
-
"status": "PASS"
|
| 135 |
-
},
|
| 136 |
-
{
|
| 137 |
-
"category": "imports",
|
| 138 |
-
"test": "import meta_rewarding",
|
| 139 |
-
"status": "PASS"
|
| 140 |
-
},
|
| 141 |
-
{
|
| 142 |
-
"category": "imports",
|
| 143 |
-
"test": "import self_taught",
|
| 144 |
-
"status": "PASS"
|
| 145 |
-
},
|
| 146 |
-
{
|
| 147 |
-
"category": "imports",
|
| 148 |
-
"test": "import prompt_optimizer",
|
| 149 |
-
"status": "PASS"
|
| 150 |
-
},
|
| 151 |
-
{
|
| 152 |
-
"category": "imports",
|
| 153 |
-
"test": "import llm_compiler",
|
| 154 |
-
"status": "PASS"
|
| 155 |
-
},
|
| 156 |
-
{
|
| 157 |
-
"category": "imports",
|
| 158 |
-
"test": "import retroformer",
|
| 159 |
-
"status": "PASS"
|
| 160 |
-
},
|
| 161 |
-
{
|
| 162 |
-
"category": "imports",
|
| 163 |
-
"test": "import robust_parser",
|
| 164 |
-
"status": "PASS"
|
| 165 |
-
},
|
| 166 |
-
{
|
| 167 |
-
"category": "imports",
|
| 168 |
-
"test": "import breakthroughs",
|
| 169 |
-
"status": "PASS"
|
| 170 |
-
},
|
| 171 |
-
{
|
| 172 |
-
"category": "instantiate",
|
| 173 |
-
"test": "State",
|
| 174 |
-
"status": "PASS"
|
| 175 |
-
},
|
| 176 |
-
{
|
| 177 |
-
"category": "instantiate",
|
| 178 |
-
"test": "Action",
|
| 179 |
-
"status": "PASS"
|
| 180 |
-
},
|
| 181 |
-
{
|
| 182 |
-
"category": "instantiate",
|
| 183 |
-
"test": "MockLLMBackend",
|
| 184 |
-
"status": "PASS"
|
| 185 |
-
},
|
| 186 |
-
{
|
| 187 |
-
"category": "instantiate",
|
| 188 |
-
"test": "ExperienceReplay",
|
| 189 |
-
"status": "PASS"
|
| 190 |
-
},
|
| 191 |
-
{
|
| 192 |
-
"category": "instantiate",
|
| 193 |
-
"test": "ToolRegistry",
|
| 194 |
-
"status": "PASS"
|
| 195 |
-
},
|
| 196 |
-
{
|
| 197 |
-
"category": "instantiate",
|
| 198 |
-
"test": "CalculatorTool",
|
| 199 |
-
"status": "PASS"
|
| 200 |
-
},
|
| 201 |
-
{
|
| 202 |
-
"category": "instantiate",
|
| 203 |
-
"test": "PythonExecTool",
|
| 204 |
-
"status": "PASS"
|
| 205 |
-
},
|
| 206 |
-
{
|
| 207 |
-
"category": "instantiate",
|
| 208 |
-
"test": "CostTracker",
|
| 209 |
-
"status": "PASS"
|
| 210 |
-
},
|
| 211 |
-
{
|
| 212 |
-
"category": "instantiate",
|
| 213 |
-
"test": "CallbackManager",
|
| 214 |
-
"status": "PASS"
|
| 215 |
-
},
|
| 216 |
-
{
|
| 217 |
-
"category": "instantiate",
|
| 218 |
-
"test": "Agent",
|
| 219 |
-
"status": "PASS"
|
| 220 |
-
},
|
| 221 |
-
{
|
| 222 |
-
"category": "instantiate",
|
| 223 |
-
"test": "KnowledgeStore",
|
| 224 |
-
"status": "PASS"
|
| 225 |
-
},
|
| 226 |
-
{
|
| 227 |
-
"category": "instantiate",
|
| 228 |
-
"test": "Graph",
|
| 229 |
-
"status": "PASS"
|
| 230 |
-
},
|
| 231 |
-
{
|
| 232 |
-
"category": "instantiate",
|
| 233 |
-
"test": "RunMode",
|
| 234 |
-
"status": "PASS"
|
| 235 |
-
},
|
| 236 |
-
{
|
| 237 |
-
"category": "instantiate",
|
| 238 |
-
"test": "Trace",
|
| 239 |
-
"status": "PASS"
|
| 240 |
-
},
|
| 241 |
-
{
|
| 242 |
-
"category": "instantiate",
|
| 243 |
-
"test": "MemoryStore",
|
| 244 |
-
"status": "PASS"
|
| 245 |
-
},
|
| 246 |
-
{
|
| 247 |
-
"category": "instantiate",
|
| 248 |
-
"test": "MemoryCard",
|
| 249 |
-
"status": "PASS"
|
| 250 |
-
},
|
| 251 |
-
{
|
| 252 |
-
"category": "instantiate",
|
| 253 |
-
"test": "MemoryCI",
|
| 254 |
-
"status": "PASS"
|
| 255 |
-
},
|
| 256 |
-
{
|
| 257 |
-
"category": "instantiate",
|
| 258 |
-
"test": "MixtureOfHeuristics",
|
| 259 |
-
"status": "PASS"
|
| 260 |
-
},
|
| 261 |
-
{
|
| 262 |
-
"category": "instantiate",
|
| 263 |
-
"test": "AdversarialHardener",
|
| 264 |
-
"status": "PASS"
|
| 265 |
-
},
|
| 266 |
-
{
|
| 267 |
-
"category": "core",
|
| 268 |
-
"test": "Full loop completes",
|
| 269 |
-
"status": "PASS"
|
| 270 |
-
},
|
| 271 |
-
{
|
| 272 |
-
"category": "core",
|
| 273 |
-
"test": "Trajectory has steps",
|
| 274 |
-
"status": "PASS"
|
| 275 |
-
},
|
| 276 |
-
{
|
| 277 |
-
"category": "core",
|
| 278 |
-
"test": "Final state exists",
|
| 279 |
-
"status": "PASS"
|
| 280 |
-
},
|
| 281 |
-
{
|
| 282 |
-
"category": "phi",
|
| 283 |
-
"test": "phi_before in [0,10]",
|
| 284 |
-
"status": "PASS"
|
| 285 |
-
},
|
| 286 |
-
{
|
| 287 |
-
"category": "phi",
|
| 288 |
-
"test": "phi_after in [0,10]",
|
| 289 |
-
"status": "PASS"
|
| 290 |
-
},
|
| 291 |
-
{
|
| 292 |
-
"category": "phi",
|
| 293 |
-
"test": "confidence in [0,1]",
|
| 294 |
-
"status": "PASS"
|
| 295 |
-
},
|
| 296 |
-
{
|
| 297 |
-
"category": "optimizer",
|
| 298 |
-
"test": "Produces heuristics",
|
| 299 |
-
"status": "PASS"
|
| 300 |
-
},
|
| 301 |
-
{
|
| 302 |
-
"category": "replay",
|
| 303 |
-
"test": "Store works",
|
| 304 |
-
"status": "PASS"
|
| 305 |
-
},
|
| 306 |
-
{
|
| 307 |
-
"category": "replay",
|
| 308 |
-
"test": "Retrieve works",
|
| 309 |
-
"status": "PASS"
|
| 310 |
-
},
|
| 311 |
-
{
|
| 312 |
-
"category": "replay",
|
| 313 |
-
"test": "Clear works",
|
| 314 |
-
"status": "PASS"
|
| 315 |
-
},
|
| 316 |
-
{
|
| 317 |
-
"category": "backend",
|
| 318 |
-
"test": "Strip <think> basic",
|
| 319 |
-
"status": "PASS"
|
| 320 |
-
},
|
| 321 |
-
{
|
| 322 |
-
"category": "backend",
|
| 323 |
-
"test": "Strip <think> multiline",
|
| 324 |
-
"status": "PASS"
|
| 325 |
-
},
|
| 326 |
-
{
|
| 327 |
-
"category": "backend",
|
| 328 |
-
"test": "Strip unclosed <think>",
|
| 329 |
-
"status": "PASS"
|
| 330 |
-
},
|
| 331 |
-
{
|
| 332 |
-
"category": "backend",
|
| 333 |
-
"test": "No tags passthrough",
|
| 334 |
-
"status": "PASS"
|
| 335 |
-
},
|
| 336 |
-
{
|
| 337 |
-
"category": "routing",
|
| 338 |
-
"test": "ollama: prefix",
|
| 339 |
-
"status": "PASS"
|
| 340 |
-
},
|
| 341 |
-
{
|
| 342 |
-
"category": "routing",
|
| 343 |
-
"test": "auto-detect ollama model",
|
| 344 |
-
"status": "PASS"
|
| 345 |
-
},
|
| 346 |
-
{
|
| 347 |
-
"category": "tools",
|
| 348 |
-
"test": "Calculator safe: 2+3*4=14",
|
| 349 |
-
"status": "PASS"
|
| 350 |
-
},
|
| 351 |
-
{
|
| 352 |
-
"category": "tools",
|
| 353 |
-
"test": "Calculator safe: sqrt(16)=4.0",
|
| 354 |
-
"status": "PASS"
|
| 355 |
-
},
|
| 356 |
-
{
|
| 357 |
-
"category": "tools",
|
| 358 |
-
"test": "Calculator blocks __import__",
|
| 359 |
-
"status": "PASS"
|
| 360 |
-
},
|
| 361 |
-
{
|
| 362 |
-
"category": "tools",
|
| 363 |
-
"test": "ReadFile blocks /etc/passwd",
|
| 364 |
-
"status": "PASS"
|
| 365 |
-
},
|
| 366 |
-
{
|
| 367 |
-
"category": "tools",
|
| 368 |
-
"test": "WriteFile blocks /tmp/evil",
|
| 369 |
-
"status": "PASS"
|
| 370 |
-
},
|
| 371 |
-
{
|
| 372 |
-
"category": "runmode",
|
| 373 |
-
"test": "TRAIN allows write",
|
| 374 |
-
"status": "PASS"
|
| 375 |
-
},
|
| 376 |
-
{
|
| 377 |
-
"category": "runmode",
|
| 378 |
-
"test": "EVAL blocks write",
|
| 379 |
-
"status": "PASS"
|
| 380 |
-
},
|
| 381 |
-
{
|
| 382 |
-
"category": "runmode",
|
| 383 |
-
"test": "EVAL is_eval",
|
| 384 |
-
"status": "PASS"
|
| 385 |
-
},
|
| 386 |
-
{
|
| 387 |
-
"category": "trace",
|
| 388 |
-
"test": "Events recorded",
|
| 389 |
-
"status": "PASS"
|
| 390 |
-
},
|
| 391 |
-
{
|
| 392 |
-
"category": "trace",
|
| 393 |
-
"test": "JSONL roundtrip",
|
| 394 |
-
"status": "PASS"
|
| 395 |
-
},
|
| 396 |
-
{
|
| 397 |
-
"category": "memory",
|
| 398 |
-
"test": "7 MemoryKinds",
|
| 399 |
-
"status": "PASS"
|
| 400 |
-
},
|
| 401 |
-
{
|
| 402 |
-
"category": "memory",
|
| 403 |
-
"test": "5 MemoryStatuses",
|
| 404 |
-
"status": "PASS"
|
| 405 |
-
},
|
| 406 |
-
{
|
| 407 |
-
"category": "memory",
|
| 408 |
-
"test": "Scoped retrieve",
|
| 409 |
-
"status": "PASS"
|
| 410 |
-
},
|
| 411 |
-
{
|
| 412 |
-
"category": "compiler",
|
| 413 |
-
"test": "Respects token budget",
|
| 414 |
-
"status": "PASS"
|
| 415 |
-
},
|
| 416 |
-
{
|
| 417 |
-
"category": "compiler",
|
| 418 |
-
"test": "Returns memory IDs",
|
| 419 |
-
"status": "PASS"
|
| 420 |
-
},
|
| 421 |
-
{
|
| 422 |
-
"category": "immune",
|
| 423 |
-
"test": "Safe passes",
|
| 424 |
-
"status": "PASS"
|
| 425 |
-
},
|
| 426 |
-
{
|
| 427 |
-
"category": "immune",
|
| 428 |
-
"test": "Injection blocked",
|
| 429 |
-
"status": "PASS"
|
| 430 |
-
},
|
| 431 |
-
{
|
| 432 |
-
"category": "immune",
|
| 433 |
-
"test": "Score hack blocked",
|
| 434 |
-
"status": "PASS"
|
| 435 |
-
},
|
| 436 |
-
{
|
| 437 |
-
"category": "immune",
|
| 438 |
-
"test": "API key blocked",
|
| 439 |
-
"status": "PASS"
|
| 440 |
-
},
|
| 441 |
-
{
|
| 442 |
-
"category": "immune",
|
| 443 |
-
"test": "Tool misuse blocked",
|
| 444 |
-
"status": "PASS"
|
| 445 |
-
},
|
| 446 |
-
{
|
| 447 |
-
"category": "ci",
|
| 448 |
-
"test": "Good \u2192 quarantined",
|
| 449 |
-
"status": "PASS"
|
| 450 |
-
},
|
| 451 |
-
{
|
| 452 |
-
"category": "ci",
|
| 453 |
-
"test": "Promote works",
|
| 454 |
-
"status": "PASS"
|
| 455 |
-
},
|
| 456 |
-
{
|
| 457 |
-
"category": "ci",
|
| 458 |
-
"test": "Injection \u2192 rejected",
|
| 459 |
-
"status": "PASS"
|
| 460 |
-
},
|
| 461 |
-
{
|
| 462 |
-
"category": "agent",
|
| 463 |
-
"test": "Agent.run() completes",
|
| 464 |
-
"status": "PASS"
|
| 465 |
-
},
|
| 466 |
-
{
|
| 467 |
-
"category": "graph",
|
| 468 |
-
"test": "Conditional routing",
|
| 469 |
-
"status": "PASS"
|
| 470 |
-
},
|
| 471 |
-
{
|
| 472 |
-
"category": "parallel",
|
| 473 |
-
"test": "3 tasks complete",
|
| 474 |
-
"status": "PASS"
|
| 475 |
-
},
|
| 476 |
-
{
|
| 477 |
-
"category": "conversation",
|
| 478 |
-
"test": "Messages produced",
|
| 479 |
-
"status": "PASS"
|
| 480 |
-
},
|
| 481 |
-
{
|
| 482 |
-
"category": "knowledge",
|
| 483 |
-
"test": "Chunks stored",
|
| 484 |
-
"status": "PASS"
|
| 485 |
-
},
|
| 486 |
-
{
|
| 487 |
-
"category": "knowledge",
|
| 488 |
-
"test": "Query returns results",
|
| 489 |
-
"status": "PASS"
|
| 490 |
-
},
|
| 491 |
-
{
|
| 492 |
-
"category": "knowledge",
|
| 493 |
-
"test": "as_tool() works",
|
| 494 |
-
"status": "PASS"
|
| 495 |
-
},
|
| 496 |
-
{
|
| 497 |
-
"category": "easy",
|
| 498 |
-
"test": "purpose() auto-detects coding team",
|
| 499 |
-
"status": "PASS"
|
| 500 |
-
},
|
| 501 |
-
{
|
| 502 |
-
"category": "easy",
|
| 503 |
-
"test": "purpose() auto-detects research team",
|
| 504 |
-
"status": "PASS"
|
| 505 |
-
},
|
| 506 |
-
{
|
| 507 |
-
"category": "easy",
|
| 508 |
-
"test": "Team.build() works",
|
| 509 |
-
"status": "PASS"
|
| 510 |
-
},
|
| 511 |
-
{
|
| 512 |
-
"category": "research",
|
| 513 |
-
"test": "MetaRewardingLoop importable",
|
| 514 |
-
"status": "PASS"
|
| 515 |
-
},
|
| 516 |
-
{
|
| 517 |
-
"category": "research",
|
| 518 |
-
"test": "SelfTaughtEvaluator importable",
|
| 519 |
-
"status": "PASS"
|
| 520 |
-
},
|
| 521 |
-
{
|
| 522 |
-
"category": "research",
|
| 523 |
-
"test": "PromptOptimizer importable",
|
| 524 |
-
"status": "PASS"
|
| 525 |
-
},
|
| 526 |
-
{
|
| 527 |
-
"category": "research",
|
| 528 |
-
"test": "LLMCompiler importable",
|
| 529 |
-
"status": "PASS"
|
| 530 |
-
},
|
| 531 |
-
{
|
| 532 |
-
"category": "research",
|
| 533 |
-
"test": "Retroformer importable",
|
| 534 |
-
"status": "PASS"
|
| 535 |
-
},
|
| 536 |
-
{
|
| 537 |
-
"category": "research",
|
| 538 |
-
"test": "PromptOptimizer.compile_prompt works",
|
| 539 |
-
"status": "PASS"
|
| 540 |
-
},
|
| 541 |
-
{
|
| 542 |
-
"category": "research",
|
| 543 |
-
"test": "LLMCompiler plans tasks",
|
| 544 |
-
"status": "PASS"
|
| 545 |
-
},
|
| 546 |
-
{
|
| 547 |
-
"category": "research",
|
| 548 |
-
"test": "LLMCompiler executes plan",
|
| 549 |
-
"status": "PASS"
|
| 550 |
-
},
|
| 551 |
-
{
|
| 552 |
-
"category": "B2-MoH",
|
| 553 |
-
"test": "Shared identified",
|
| 554 |
-
"status": "PASS"
|
| 555 |
-
},
|
| 556 |
-
{
|
| 557 |
-
"category": "B2-MoH",
|
| 558 |
-
"test": "Total K=5 selected",
|
| 559 |
-
"status": "PASS"
|
| 560 |
-
},
|
| 561 |
-
{
|
| 562 |
-
"category": "B6-adversarial",
|
| 563 |
-
"test": "Catch rate 95%",
|
| 564 |
-
"status": "PASS"
|
| 565 |
-
},
|
| 566 |
-
{
|
| 567 |
-
"category": "B6-adversarial",
|
| 568 |
-
"test": "FP rate 0%",
|
| 569 |
-
"status": "PASS"
|
| 570 |
-
},
|
| 571 |
-
{
|
| 572 |
-
"category": "parser",
|
| 573 |
-
"test": "TOML actor parse",
|
| 574 |
-
"status": "PASS"
|
| 575 |
-
},
|
| 576 |
-
{
|
| 577 |
-
"category": "parser",
|
| 578 |
-
"test": "JSON actor parse",
|
| 579 |
-
"status": "PASS"
|
| 580 |
-
},
|
| 581 |
-
{
|
| 582 |
-
"category": "parser",
|
| 583 |
-
"test": "TOML critic parse",
|
| 584 |
-
"status": "PASS"
|
| 585 |
-
},
|
| 586 |
-
{
|
| 587 |
-
"category": "parser",
|
| 588 |
-
"test": "Extract code from markdown",
|
| 589 |
-
"status": "PASS"
|
| 590 |
-
},
|
| 591 |
-
{
|
| 592 |
-
"category": "benchmark",
|
| 593 |
-
"test": "Improvement curve: [1.0, 10.0, 10.0]",
|
| 594 |
-
"status": "PASS"
|
| 595 |
-
},
|
| 596 |
-
{
|
| 597 |
-
"category": "benchmark",
|
| 598 |
-
"test": "Heuristics learned: 6",
|
| 599 |
-
"status": "PASS"
|
| 600 |
-
}
|
| 601 |
-
]
|
| 602 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|