clean: remove temp scripts, patches, and dev artifacts before public release

Browse files

Files changed (13) hide show

TEST_FIXES.md +0 -104
apply_fixes.py +0 -116
apply_llm_timeout.py +0 -15
apply_validate_fix.py +0 -15
benchmarks/results/track2_report.txt +0 -32
benchmarks/results/track2_results.json +0 -104
build_and_publish.py +0 -147
local_test_with_api.sh +0 -87
patches/llm_backend_timeout.patch +0 -13
patches/openai_backend_init.py +0 -15
patches/validate_mock_fix.patch +0 -11
run_all_tests.sh +0 -142
tests/results/launch_readiness.json +0 -602

TEST_FIXES.md DELETED Viewed

@@ -1,104 +0,0 @@
-# Test Fixes Applied for v3.0.0
-## Issue 1: Trajectory None guards (FIXED)
-- File: `purpose_agent/types.py` — UPDATED
-- Changed: cumulative_reward, total_delta, success_rate properties now check both `s.score is not None` AND `s.score.delta is not None`
-- Added docstring note that sre_patches.py replaces these at import time
-- Baseline and SRE-patched versions are now equivalent
-## Issue 2: Backpressure test flakiness (NEEDS MANUAL FIX)
-- File: `tests/test_sprint1_events.py` — T1.6 section
-- Problem: async consumer may not start before flooding; terminal event might not arrive
-- Fix: Replace the test_backpressure() function with this more robust version:
-```python
-async def test_backpressure():
-    bus6 = EventBus(max_queue_size=3)
-    received = []
-    consumer_started = asyncio.Event()
-    async def consumer():
-        consumer_started.set()
-        try:
-            async for event in bus6.subscribe():
-                received.append(event)
-                await asyncio.sleep(0.01)
-        except asyncio.CancelledError:
-            pass
-    task = asyncio.create_task(consumer())
-    await consumer_started.wait()
-    await asyncio.sleep(0.05)
-    for i in range(20):
-        bus6.emit(create_event("r6", EventKind.TEXT_DELTA, seq=i, text=f"w{i}"))
-    bus6.emit(create_event("r6", EventKind.RUN_FINISHED, seq=99, result="done"))
-    await asyncio.sleep(1.0)
-    bus6.close()
-    task.cancel()
-    try:
-        await asyncio.wait_for(task, timeout=2.0)
-    except (asyncio.CancelledError, asyncio.TimeoutError):
-        pass
-    has_terminal = any(e.kind == EventKind.RUN_FINISHED for e in received)
-    return has_terminal
-```
-Key changes:
-- Added `consumer_started` Event to ensure consumer is running before flooding
-- Increased final wait from 0.5s to 1.0s
-- Added `asyncio.wait_for` timeout on task cleanup
-## Issue 3: prod_test.py API timeout (NEEDS MANUAL FIX)
-- File: `tests/prod_test.py`
-- Problem: No timeout on OpenRouter API calls; tests could hang
-- Fix: Wrap the backend creation with a timeout, add retry logic:
-After line `b = resolve_backend(...)`, add:
-```python
-import signal
-class TimeoutError(Exception):
-    pass
-def timeout_handler(signum, frame):
-    raise TimeoutError("API call timed out")
-# Install the SIGALRM handler; arm it with signal.alarm(60) before each API call (Unix only)
-signal.signal(signal.SIGALRM, timeout_handler)
-```
-Or simpler: add a timeout to the OpenAI client itself, where the backend constructs it:
-```python
-# In llm_backend.py OpenAICompatibleBackend.__init__, add:
-self.client = OpenAI(
-    base_url=base_url,
-    api_key=api_key or os.environ.get("OPENAI_API_KEY"),
-    timeout=60.0,  # 60 second timeout on all API calls
-)
-```
-## Issue 4: validate.py mock resilience (NEEDS MANUAL FIX)
-- File: `benchmarks/validate.py`
-- Problem: Mock matches on "Learned Strategies" + "None yet" text; fragile if prompt format changes
-- Fix: In make_mock(), make the heuristic detection more resilient:
-Change: `has_h = "Learned Strategies" in text and "None yet" not in text`
-To: `has_h = "Learned Strategies" in text and "None yet" not in text and "heuristics" in text.lower()`
-Or better: check the heuristic count directly:
-```python
-has_h = any("When:" in line or "Do:" in line for line in text.split("\n"))
-```
-## Issue 5: CalculatorTool __import__ blocking (VERIFIED WORKING)
-- File: `purpose_agent/tools.py`
-- CalculatorTool.execute() validates tokens with: `if re.search(r'[a-zA-Z_]', tokens)`
-- After removing known function names (abs, round, sqrt, etc.), any remaining letters are rejected
-- `__import__("os")` → after removing known functions, `__import__` and `os` remain → rejected ✓
-- Also: AST walker checks Call nodes and rejects unknown function names
-- eval() uses `{"__builtins__": {}}` — no builtins available
-- Test in benchmark_v3.py: `check("tools.calc_blocks_import", "Error" in calc.run(expression='__import__("os")').output)` — CORRECT

apply_fixes.py DELETED Viewed

@@ -1,116 +0,0 @@
-#!/usr/bin/env python3
-"""
-apply_fixes.py — Apply all 5 test fixes for purpose-agent v3.0.0.
-Run this AFTER cloning the repo, BEFORE running tests:
-    pip install huggingface_hub
-    python -c "from huggingface_hub import snapshot_download; snapshot_download('Rohan03/purpose-agent', local_dir='./pa', repo_type='model')"
-    cd pa
-    python apply_fixes.py
-    bash run_all_tests.sh
-"""
-import os
-import re
-def fix_file(path, description, old, new):
-    """Apply a single string replacement to a file."""
-    if not os.path.exists(path):
-        print(f"  ⚠️  {path} not found — skipping")
-        return False
-    with open(path, "r") as f:
-        content = f.read()
-    if old not in content:
-        print(f"  ⚠️  Pattern not found in {path} — may already be fixed")
-        return False
-    content = content.replace(old, new, 1)
-    with open(path, "w") as f:
-        f.write(content)
-    print(f"  ✅ {description}")
-    return True
-def main():
-    print("purpose-agent v3.0.0 — Applying test fixes\n")
-    fixes = 0
-    # ═══ Fix 1: Trajectory None guards (already applied in types.py on repo) ═══
-    print("Fix 1: Trajectory None guards in types.py")
-    f1 = fix_file(
-        "purpose_agent/types.py",
-        "Updated cumulative_reward to check score.delta is not None",
-        old='    @property\n    def cumulative_reward(self) -> float:\n        """Sum of all positive deltas in the trajectory."""\n        return sum(\n            s.score.delta for s in self.steps\n            if s.score is not None and s.score.delta > 0\n        )',
-        new='    @property\n    def cumulative_reward(self) -> float:\n        """Sum of all positive deltas in the trajectory (None-safe)."""\n        total = 0.0\n        for s in self.steps:\n            if s.score is not None and s.score.delta is not None and s.score.delta > 0:\n                total += s.score.delta\n        return total',
-    )
-    fixes += int(f1)
-    f1b = fix_file(
-        "purpose_agent/types.py",
-        "Updated total_delta to check score.delta is not None",
-        old='    @property\n    def total_delta(self) -> float:\n        """Net state improvement across the entire trajectory."""\n        return sum(\n            s.score.delta for s in self.steps if s.score is not None\n        )',
-        new='    @property\n    def total_delta(self) -> float:\n        """Net state improvement across the entire trajectory (None-safe)."""\n        total = 0.0\n        for s in self.steps:\n            if s.score is not None and s.score.delta is not None:\n                total += s.score.delta\n        return total',
-    )
-    fixes += int(f1b)
-    f1c = fix_file(
-        "purpose_agent/types.py",
-        "Updated success_rate to check score.delta is not None",
-        old='        scored = [s for s in self.steps if s.score is not None]\n        if not scored:\n            return 0.0\n        return sum(1 for s in scored if s.score.improved) / len(scored)',
-        new='        scored = [s for s in self.steps if s.score is not None and s.score.delta is not None]\n        if not scored:\n            return 0.0\n        return sum(1 for s in scored if s.score.improved) / len(scored)',
-    )
-    fixes += int(f1c)
-    # ═══ Fix 2: Backpressure test robustness (already applied in repo) ═══
-    print("\nFix 2: Backpressure test T1.6 in test_sprint1_events.py")
-    f2 = fix_file(
-        "tests/test_sprint1_events.py",
-        "Added consumer_started Event sync to backpressure test",
-        old='    bus6 = EventBus(max_queue_size=3)  # Very small queue\n\n    received = []\n\n    async def consumer():\n        async for event in bus6.subscribe():\n            received.append(event)\n            await asyncio.sleep(0.01)  # Slow consumer\n\n    # Start consumer\n    task = asyncio.create_task(consumer())\n    await asyncio.sleep(0.05)',
-        new='    bus6 = EventBus(max_queue_size=3)\n    received = []\n    consumer_started = asyncio.Event()\n\n    async def consumer():\n        consumer_started.set()\n        try:\n            async for event in bus6.subscribe():\n                received.append(event)\n                await asyncio.sleep(0.01)\n        except asyncio.CancelledError:\n            pass\n\n    task = asyncio.create_task(consumer())\n    await consumer_started.wait()\n    await asyncio.sleep(0.05)',
-    )
-    fixes += int(f2)
-    f2b = fix_file(
-        "tests/test_sprint1_events.py",
-        "Added longer wait and wait_for on task cleanup",
-        old='    await asyncio.sleep(0.5)\n    bus6.close()\n    task.cancel()\n    try:\n        await task\n    except asyncio.CancelledError:\n        pass',
-        new='    await asyncio.sleep(1.0)\n    bus6.close()\n    task.cancel()\n    try:\n        await asyncio.wait_for(task, timeout=2.0)\n    except (asyncio.CancelledError, asyncio.TimeoutError):\n        pass',
-    )
-    fixes += int(f2b)
-    # ═══ Fix 3: OpenAI backend timeout (already applied in prod_test.py) ═══
-    print("\nFix 3: Add 60s timeout to OpenAI-compatible backend")
-    f3 = fix_file(
-        "purpose_agent/llm_backend.py",
-        "Added timeout parameter to OpenAICompatibleBackend",
-        old='    def __init__(\n        self,\n        model: str = "gpt-4o",\n        base_url: str | None = None,\n        api_key: str | None = None,\n    ):\n        from openai import OpenAI\n\n        self.model = model\n        self.client = OpenAI(\n            base_url=base_url,\n            api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n        )',
-        new='    def __init__(\n        self,\n        model: str = "gpt-4o",\n        base_url: str | None = None,\n        api_key: str | None = None,\n        timeout: float = 60.0,\n    ):\n        from openai import OpenAI\n\n        self.model = model\n        self.client = OpenAI(\n            base_url=base_url,\n            api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n            timeout=timeout,\n        )',
-    )
-    fixes += int(f3)
-    # ═══ Fix 4: validate.py mock heuristic detection ═══
-    print("\nFix 4: Make mock heuristic detection more resilient")
-    f4 = fix_file(
-        "benchmarks/validate.py",
-        "Broadened heuristic detection from exact string to include 'When:' pattern",
-        old='        has_h = "Learned Strategies" in text and "None yet" not in text',
-        new='        has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text',
-    )
-    fixes += int(f4)
-    # ═══ Fix 5: CalculatorTool verification (no code change needed) ═══
-    print("\nFix 5: CalculatorTool __import__ blocking — VERIFIED (no change needed)")
-    print("  ✅ CalculatorTool.execute() rejects letters after removing known functions")
-    print("  ✅ AST walker rejects unknown function calls")
-    print("  ✅ eval() uses empty __builtins__")
-    print("  ✅ benchmark_v3.py test: 'Error' in calc.run('__import__(\"os\")').output")
-    # ═══ Summary ═══
-    print(f"\n{'='*50}")
-    print(f"  Fixes applied: {fixes}")
-    print(f"  Fix 5: Verified (no change needed)")
-    print(f"{'='*50}")
-    print("\n  Next: bash run_all_tests.sh")
-if __name__ == "__main__":
-    main()

apply_llm_timeout.py DELETED Viewed

@@ -1,15 +0,0 @@
-#!/usr/bin/env python3
-"""Apply the OpenAI backend timeout fix directly."""
-import os
-path = os.path.join(os.path.dirname(__file__), "purpose_agent", "llm_backend.py")
-with open(path, "r") as f:
-    content = f.read()
-old = '    def __init__(\n        self,\n        model: str = "gpt-4o",\n        base_url: str | None = None,\n        api_key: str | None = None,\n    ):\n        from openai import OpenAI\n\n        self.model = model\n        self.client = OpenAI(\n            base_url=base_url,\n            api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n        )'
-new = '    def __init__(\n        self,\n        model: str = "gpt-4o",\n        base_url: str | None = None,\n        api_key: str | None = None,\n        timeout: float = 60.0,\n    ):\n        from openai import OpenAI\n\n        self.model = model\n        self.client = OpenAI(\n            base_url=base_url,\n            api_key=api_key or os.environ.get("OPENAI_API_KEY"),\n            timeout=timeout,\n        )'
-if old in content:
-    content = content.replace(old, new, 1)
-    with open(path, "w") as f:
-        f.write(content)
-    print("✅ Applied OpenAI backend timeout fix (60s)")
-else:
-    print("⚠️ Pattern not found — may already be patched")

apply_validate_fix.py DELETED Viewed

@@ -1,15 +0,0 @@
-#!/usr/bin/env python3
-"""Apply the validate.py mock heuristic detection fix."""
-import os
-path = os.path.join(os.path.dirname(__file__), "benchmarks", "validate.py")
-with open(path, "r") as f:
-    content = f.read()
-old = '        has_h = "Learned Strategies" in text and "None yet" not in text'
-new = '        has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text'
-if old in content:
-    content = content.replace(old, new, 1)
-    with open(path, "w") as f:
-        f.write(content)
-    print("✅ Applied validate.py heuristic detection fix")
-else:
-    print("⚠️ Pattern not found — may already be patched")

benchmarks/results/track2_report.txt DELETED Viewed

@@ -1,32 +0,0 @@
-╔════════════════════════════════════════════════════╗
-║  Purpose Agent — Track 2 Validation Report        ║
-╚════════════════════════════════════════════════════╝
-═══ Improvement Curves ═══
-Task            Run  Steps      Φ   Pass%  Heur
-────────────────────────────────────────────────
-fibonacci         1      2    5.0    50%     3
-fibonacci         2      1    5.0   100%     9
-fibonacci         3      1    5.0   100%    18
-  → Δ(Φ) = +0.0 (no change)
-factorial         1      2    5.0     0%     3
-factorial         2      1    5.0   100%     9
-factorial         3      1    5.0   100%    18
-  → Δ(Φ) = +0.0 (no change)
-═══ Cold vs Warm ═══
-  fibonacci      cold=5.0  warm=5.0  Δ=+0.0
-  factorial      cold=5.0  warm=5.0  Δ=+0.0
-═══ Cross-Task Transfer (['fibonacci', 'factorial'] → ['palindrome', 'fizzbuzz']) ═══
-  30 heuristics transferred
-  palindrome: ✓ Φ=5.0
-  fizzbuzz: ✓ Φ=5.0
-═══ Adversarial Robustness: 100% (8/8) ═══
-═══ VERDICT ═══
-  ✗ Self-improvement: NOT demonstrated
-  ✗ Cold/warm: no benefit from memory
-  ✓ Immune system: 100% adversarial accuracy

benchmarks/results/track2_results.json DELETED Viewed

@@ -1,104 +0,0 @@
-{
-  "curves": {
-    "fibonacci": [
-      {
-        "run": 1,
-        "steps": 2,
-        "phi": 5.0,
-        "pass_rate": 0.5,
-        "all_passed": false,
-        "heuristics": 3,
-        "time": 0.01
-      },
-      {
-        "run": 2,
-        "steps": 1,
-        "phi": 5.0,
-        "pass_rate": 1.0,
-        "all_passed": true,
-        "heuristics": 9,
-        "time": 0.0
-      },
-      {
-        "run": 3,
-        "steps": 1,
-        "phi": 5.0,
-        "pass_rate": 1.0,
-        "all_passed": true,
-        "heuristics": 18,
-        "time": 0.0
-      }
-    ],
-    "factorial": [
-      {
-        "run": 1,
-        "steps": 2,
-        "phi": 5.0,
-        "pass_rate": 0.0,
-        "all_passed": false,
-        "heuristics": 3,
-        "time": 0.0
-      },
-      {
-        "run": 2,
-        "steps": 1,
-        "phi": 5.0,
-        "pass_rate": 1.0,
-        "all_passed": true,
-        "heuristics": 9,
-        "time": 0.0
-      },
-      {
-        "run": 3,
-        "steps": 1,
-        "phi": 5.0,
-        "pass_rate": 1.0,
-        "all_passed": true,
-        "heuristics": 18,
-        "time": 0.0
-      }
-    ]
-  },
-  "cold_warm": [
-    {
-      "task": "fibonacci",
-      "cold_phi": 5.0,
-      "warm_phi": 5.0,
-      "delta": 0.0,
-      "improved": false
-    },
-    {
-      "task": "factorial",
-      "cold_phi": 5.0,
-      "warm_phi": 5.0,
-      "delta": 0.0,
-      "improved": false
-    }
-  ],
-  "transfer": {
-    "train": [
-      "fibonacci",
-      "factorial"
-    ],
-    "test": [
-      "palindrome",
-      "fizzbuzz"
-    ],
-    "heuristics": 30,
-    "results": {
-      "palindrome": {
-        "phi": 5.0,
-        "passed": true
-      },
-      "fizzbuzz": {
-        "phi": 5.0,
-        "passed": true
-      }
-    }
-  },
-  "adversarial": {
-    "total": 8,
-    "correct": 8,
-    "accuracy": 1.0
-  }
-}

build_and_publish.py DELETED Viewed

@@ -1,147 +0,0 @@
-#!/usr/bin/env python3
-"""
-build_and_publish.py — Build purpose-agent v3.0.0 and publish to PyPI.
-Prerequisites:
-    pip install build twine
-Usage:
-    python build_and_publish.py              # Build + publish
-    python build_and_publish.py --build-only # Build only, don't publish
-    python build_and_publish.py --check      # Build + twine check, don't publish
-Environment:
-    PYPI_TOKEN — PyPI API token (or pass the token itself, which starts with "pypi-", as a command-line argument)
-"""
-import os
-import sys
-import shutil
-import subprocess
-def run(cmd, **kwargs):
-    """Run a command and check for errors."""
-    print(f"  $ {cmd}")
-    result = subprocess.run(cmd, shell=True, **kwargs)
-    if result.returncode != 0:
-        print(f"  ❌ Command failed with exit code {result.returncode}")
-        sys.exit(1)
-    return result
-def main():
-    build_only = "--build-only" in sys.argv
-    check_only = "--check" in sys.argv
-    # Get PyPI token
-    pypi_token = os.environ.get("PYPI_TOKEN", "")
-    if not build_only and not check_only:
-        # Try to get from command line args
-        for arg in sys.argv:
-            if arg.startswith("pypi-"):
-                pypi_token = arg
-                break
-        if not pypi_token:
-            print("⚠️  No PyPI token found. Set PYPI_TOKEN env var or pass as argument.")
-            print("   Usage: python build_and_publish.py pypi-AgE...")
-            if not build_only:
-                sys.exit(1)
-    # Verify version
-    print("\n═══ Step 1: Verify version ═══")
-    sys.path.insert(0, ".")
-    import purpose_agent
-    version = purpose_agent.__version__
-    print(f"  Version: {version}")
-    if version != "3.0.0":
-        print(f"  ❌ Version is {version}, expected 3.0.0!")
-        sys.exit(1)
-    print(f"  ✅ Version confirmed: {version}")
-    # Verify imports
-    print("\n═══ Step 2: Verify imports ═══")
-    missing = [n for n in purpose_agent.__all__ if not hasattr(purpose_agent, n)]
-    if missing:
-        print(f"  ❌ Missing exports: {missing}")
-        sys.exit(1)
-    print(f"  ✅ All {len(purpose_agent.__all__)} exports importable")
-    # Clean old builds
-    print("\n═══ Step 3: Clean old builds ═══")
-    for path in ["dist", "build", "*.egg-info"]:
-        if os.path.exists(path):
-            shutil.rmtree(path)
-            print(f"  Removed: {path}")
-    # Also clean any .egg-info in current dir
-    for item in os.listdir("."):
-        if item.endswith(".egg-info"):
-            shutil.rmtree(item)
-            print(f"  Removed: {item}")
-    print("  ✅ Cleaned")
-    # Build
-    print("\n═══ Step 4: Build sdist + wheel ═══")
-    run("python -m build")
-    # List artifacts
-    print("\n  Artifacts:")
-    if os.path.exists("dist"):
-        for f in os.listdir("dist"):
-            size = os.path.getsize(os.path.join("dist", f))
-            print(f"    {f} ({size:,} bytes)")
-    # Twine check
-    print("\n═══ Step 5: Twine check ═══")
-    run("twine check dist/*")
-    if check_only:
-        print("\n  ✅ Build + check complete. Not publishing (use without --check).")
-        return
-    if build_only:
-        print("\n  ✅ Build complete. Not publishing (use without --build-only).")
-        return
-    # Publish
-    print("\n═══ Step 6: Publish to PyPI ═══")
-    print(f"  Package: purpose-agent=={version}")
-    print(f"  Target: https://pypi.org/project/purpose-agent/")
-    print()
-    run(
-        f'twine upload dist/* '
-        f'--username __token__ '
-        f'--password "{pypi_token}"'
-    )
-    # Verify
-    print("\n═══ Step 7: Verify on PyPI ═══")
-    print(f"  Waiting 10s for PyPI to index...")
-    subprocess.run(["sleep", "10"])
-    print(f"  Installing from PyPI...")
-    run(f'pip install purpose-agent=={version}')
-    result = subprocess.run(
-        ['python', '-c', f'''
-import purpose_agent as pa
-print(f"  v{{pa.__version__}} — {{len(pa.__all__)}} exports")
-assert pa.__version__ == "{version}", f"Version mismatch: {{pa.__version__}}"
-'''],
-        capture_output=True, text=True,
-    )
-    print(result.stdout)
-    if result.returncode != 0:
-        print(f"  ⚠️  Verification failed: {result.stderr}")
-    else:
-        print("  ✅ PyPI install verified!")
-    print(f"\n{'='*60}")
-    print(f"  ✅ purpose-agent=={version} PUBLISHED TO PYPI!")
-    print(f"  📦 https://pypi.org/project/purpose-agent/{version}/")
-    print(f"  📦 https://huggingface.co/Rohan03/purpose-agent")
-    print(f"{'='*60}")
-if __name__ == "__main__":
-    main()

local_test_with_api.sh DELETED Viewed

@@ -1,87 +0,0 @@
-#!/usr/bin/env bash
-# ═══════════════════════════════════════════════════════════════
-# purpose-agent v3.0.0 — Local Test Runner (including cloud model)
-#
-# Run on your own machine (no HF credits needed):
-#   git clone https://huggingface.co/Rohan03/purpose-agent pa && cd pa
-#   pip install -e .
-#   export OPENROUTER_API_KEY="sk-or-v1-..."
-#   bash local_test_with_api.sh
-# ═══════════════════════════════════════════════════════════════
-set -e
-PASS=0; FAIL=0; TOTAL=0
-run_test() {
-    local name="$1"; local cmd="$2"
-    echo ""
-    echo "══════════════════════════════════════════════════════════"
-    echo "  Running: $name"
-    echo "══════════════════════════════════════════════════════════"
-    if eval "$cmd"; then
-        echo "  ✅ $name PASSED"; PASS=$((PASS+1))
-    else
-        echo "  ❌ $name FAILED"; FAIL=$((FAIL+1))
-    fi
-    TOTAL=$((TOTAL+1))
-}
-echo "╔══════════════════════════════════════════════════════════╗"
-echo "║  purpose-agent v3.0.0 — Full Test Suite (Local + Cloud)║"
-echo "╚══════════════════════════════════════════════════════════╝"
-# Pre-flight
-echo "═══ Pre-flight ═══"
-python -c "
-import purpose_agent as pa
-print(f'  v{pa.__version__} — {len(pa.__all__)} exports')
-assert pa.__version__ == '3.0.0', f'Version: {pa.__version__}'
-assert len(pa.__all__) >= 110, f'Exports: {len(pa.__all__)}'
-missing = [n for n in pa.__all__ if not hasattr(pa, n)]
-assert not missing, f'Missing: {missing}'
-print('  ✅ All exports importable')
-"
-# Apply fixes
-echo ""
-echo "═══ Applying test fixes ═══"
-python apply_fixes.py
-# Layer 1: Unit Tests
-run_test "test_core" "python tests/test_core.py"
-run_test "test_public_api_211" "python tests/compat/test_public_api_211.py"
-run_test "test_first_principles" "python tests/test_first_principles.py"
-run_test "test_hardening" "python tests/test_hardening.py"
-run_test "test_sre_regression" "python tests/test_sre_regression.py"
-# Layer 2: Feature Tests
-run_test "test_sprint1_events" "python tests/test_sprint1_events.py"
-run_test "test_sprint2_checkpoint" "python tests/test_sprint2_checkpoint.py"
-run_test "test_sprint3_homeostasis" "python tests/test_sprint3_homeostasis.py"
-run_test "test_sprint4_8_protocols" "python tests/test_sprint4_8_protocols.py"
-run_test "test_track_c" "python tests/test_track_c.py"
-run_test "test_track_d" "python tests/test_track_d.py"
-# Layer 3: Integration
-run_test "validate_quick" "python benchmarks/validate.py --quick"
-run_test "benchmark_v3" "python -m purpose_agent.benchmark_v3"
-# Layer 4: Cloud Model Tests
-if [ -n "$OPENROUTER_API_KEY" ]; then
-    run_test "prod_test (real model)" "python tests/prod_test.py"
-else
-    echo "  ⚠️  OPENROUTER_API_KEY not set — skipping cloud tests"
-fi
-# Report
-echo ""
-echo "╔══════════════════════════════════════════════════════════╗"
-echo "║  RESULTS: $PASS/$TOTAL passed, $FAIL failed"
-if [ $FAIL -eq 0 ]; then
-echo "║  ✅ ALL TESTS PASSED — READY TO PUBLISH"
-echo "║  Next: python build_and_publish.py"
-else
-echo "║  ❌ $FAIL FAILURES — FIX BEFORE PUBLISHING"
-fi
-echo "╚══════════════════════════════════════════════════════════╝"
-exit $FAIL

patches/llm_backend_timeout.patch DELETED Viewed

@@ -1,13 +0,0 @@
---- a/purpose_agent/llm_backend.py
-+++ b/purpose_agent/llm_backend.py
-@@ -199,7 +199,7 @@
-     ):
-         from openai import OpenAI
-         self.model = model
-         self.client = OpenAI(
-             base_url=base_url,
--            api_key=api_key or os.environ.get("OPENAI_API_KEY"),
-+            api_key=api_key or os.environ.get("OPENAI_API_KEY"),
-+            timeout=60.0,
-         )

patches/openai_backend_init.py DELETED Viewed

@@ -1,15 +0,0 @@
-    def __init__(
-        self,
-        model: str = "gpt-4o",
-        base_url: str | None = None,
-        api_key: str | None = None,
-        timeout: float = 60.0,
-    ):
-        from openai import OpenAI
-        self.model = model
-        self.client = OpenAI(
-            base_url=base_url,
-            api_key=api_key or os.environ.get("OPENAI_API_KEY"),
-            timeout=timeout,
-        )

patches/validate_mock_fix.patch DELETED Viewed

@@ -1,11 +0,0 @@
---- a/benchmarks/validate.py
-+++ b/benchmarks/validate.py
-@@ -107,7 +107,7 @@
- def make_mock(task_name):
-     mock = MockLLMBackend()
-     t = TASKS[task_name]
-     def actor(msgs):
-         text = " ".join(m.content for m in msgs)
--        has_h = "Learned Strategies" in text and "None yet" not in text
-+        has_h = ("Learned Strategies" in text or "When:" in text) and "None yet" not in text
-         code = t["good"] if has_h else t["bad"]

run_all_tests.sh DELETED Viewed

@@ -1,142 +0,0 @@
-#!/usr/bin/env bash
-# ═══════════════════════════════════════════════════════════════
-# purpose-agent v3.0.0 — Complete Test Suite Runner
-#
-# Usage:
-#   chmod +x run_all_tests.sh
-#   ./run_all_tests.sh              # All tests (mock only)
-#   ./run_all_tests.sh --prod       # Include real model tests (needs OPENROUTER_API_KEY)
-# ═══════════════════════════════════════════════════════════════
-set -e
-PASS=0
-FAIL=0
-TOTAL=0
-run_test() {
-    local name="$1"
-    local cmd="$2"
-    echo ""
-    echo "══════════════════════════════════════════════════════════"
-    echo "  Running: $name"
-    echo "══════════════════════════════════════════════════════════"
-    if eval "$cmd"; then
-        echo "  ✅ $name PASSED"
-        PASS=$((PASS + 1))
-    else
-        echo "  ❌ $name FAILED"
-        FAIL=$((FAIL + 1))
-    fi
-    TOTAL=$((TOTAL + 1))
-}
-echo "╔══════════════════════════════════════════════════════════╗"
-echo "║  purpose-agent v3.0.0 — Complete Test Suite            ║"
-echo "╚══════════════════════════════════════════════════════════╝"
-echo ""
-# ── Pre-flight: verify package imports ──
-echo "═══ Pre-flight: Package Import Check ═══"
-python -c "
-import purpose_agent as pa
-print(f'  v{pa.__version__} — {len(pa.__all__)} exports')
-assert pa.__version__ == '3.0.0', f'Version mismatch: {pa.__version__}'
-assert len(pa.__all__) >= 110, f'Not enough exports: {len(pa.__all__)}'
-missing = [n for n in pa.__all__ if not hasattr(pa, n)]
-assert len(missing) == 0, f'Missing exports: {missing}'
-print('  ✅ All exports importable')
-"
-# ═══════════════════════════════════════════════════════════════
-# LAYER 1: Unit Tests
-# ═══════════════════════════════════════════════════════════════
-run_test "test_core (basic loop, Φ bounds, optimizer, replay, immune)" \
-    "python tests/test_core.py"
-run_test "test_public_api_211 (all 120+ exports, Level 1/2/3)" \
-    "python tests/compat/test_public_api_211.py"
-run_test "test_first_principles (state-delta O(1), falsification, PEP 578)" \
-    "python tests/test_first_principles.py"
-run_test "test_hardening (null safety, timeouts, validation)" \
-    "python tests/test_hardening.py"
-run_test "test_sre_regression (5 critical vulnerability scenarios)" \
-    "python tests/test_sre_regression.py"
-# ═══════════════════════════════════════════════════════════════
-# LAYER 2: Feature Tests
-# ═══════════════════════════════════════════════════════════════
-run_test "test_sprint1_events (event bus, lanes, CoT rejection)" \
-    "python tests/test_sprint1_events.py"
-run_test "test_sprint2_checkpoint (durable execution, resume, idempotency)" \
-    "python tests/test_sprint2_checkpoint.py"
-run_test "test_sprint3_homeostasis (memory budget, consolidation, hibernation)" \
-    "python tests/test_sprint3_homeostasis.py"
-run_test "test_sprint4_8_protocols (MCP, A2A, AG-UI, AGENTS.md, quorum)" \
-    "python tests/test_sprint4_8_protocols.py"
-run_test "test_track_c (routing, MAS generator, skills)" \
-    "python tests/test_track_c.py"
-run_test "test_track_d (fingerprint, dataset, prompt pack, optimizer, distillation)" \
-    "python tests/test_track_d.py"
-# ═══════════════════════════════════════════════════════════════
-# LAYER 3: Integration Tests
-# ═══════════════════════════════════════════════════════════════
-run_test "validate.py --quick (improvement curves + adversarial)" \
-    "python benchmarks/validate.py --quick"
-run_test "benchmark_v3 (35+ robustness checks across all subsystems)" \
-    "python -m purpose_agent.benchmark_v3"
-# ═══════════════════════════════════════════════════════════════
-# LAYER 4: Production Tests (optional — needs API key)
-# ═══════════════════════════════════════════════════════════════
-if [ "$1" = "--prod" ]; then
-    if [ -z "$OPENROUTER_API_KEY" ]; then
-        echo "⚠️  OPENROUTER_API_KEY not set — skipping prod tests"
-        echo "   Set it with: export OPENROUTER_API_KEY=sk-or-v1-..."
-    else
-        run_test "prod_test (real model Level 1/2/3 + coding + security)" \
-            "python tests/prod_test.py"
-    fi
-else
-    echo ""
-    echo "  ℹ️  Production tests skipped (use --prod flag to run with real model)"
-fi
-# ═══════════════════════════════════════════════════════════════
-# FINAL REPORT
-# ═══════════════════════════════════════════════════════════════
-echo ""
-echo "╔══════════════════════════════════════════════════════════╗"
-echo "║  FINAL RESULTS                                         ║"
-echo "╚══════════════════════════════════════════════════════════╝"
-echo ""
-echo "  Total suites: $TOTAL"
-echo "  Passed:       $PASS"
-echo "  Failed:       $FAIL"
-echo ""
-if [ $FAIL -eq 0 ]; then
-    echo "  ✅ ALL $TOTAL TEST SUITES PASSED — ZERO FAILURES"
-    echo ""
-    echo "  Ready to publish: purpose-agent==3.0.0"
-    echo "  Next step: python build_and_publish.py"
-    exit 0
-else
-    echo "  ❌ $FAIL SUITES FAILED — FIX BEFORE PUBLISHING"
-    exit 1
-fi

tests/results/launch_readiness.json DELETED Viewed

@@ -1,602 +0,0 @@
-{
-  "pass": 119,
-  "fail": 0,
-  "warn": 0,
-  "results": [
-    {
-      "category": "imports",
-      "test": "import purpose_agent",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import types",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import llm_backend",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import actor",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import purpose_function",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import experience_replay",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import optimizer",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import orchestrator",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import slm_backends",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import streaming",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import tools",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import observability",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import multi_agent",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import hitl",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import evaluation",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import registry",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import unified",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import easy",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import v2_types",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import trace",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import memory",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import compiler",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import immune",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import memory_ci",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import evalport",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import benchmark_v2",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import meta_rewarding",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import self_taught",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import prompt_optimizer",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import llm_compiler",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import retroformer",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import robust_parser",
-      "status": "PASS"
-    },
-    {
-      "category": "imports",
-      "test": "import breakthroughs",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "State",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "Action",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "MockLLMBackend",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "ExperienceReplay",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "ToolRegistry",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "CalculatorTool",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "PythonExecTool",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "CostTracker",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "CallbackManager",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "Agent",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "KnowledgeStore",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "Graph",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "RunMode",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "Trace",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "MemoryStore",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "MemoryCard",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "MemoryCI",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "MixtureOfHeuristics",
-      "status": "PASS"
-    },
-    {
-      "category": "instantiate",
-      "test": "AdversarialHardener",
-      "status": "PASS"
-    },
-    {
-      "category": "core",
-      "test": "Full loop completes",
-      "status": "PASS"
-    },
-    {
-      "category": "core",
-      "test": "Trajectory has steps",
-      "status": "PASS"
-    },
-    {
-      "category": "core",
-      "test": "Final state exists",
-      "status": "PASS"
-    },
-    {
-      "category": "phi",
-      "test": "phi_before in [0,10]",
-      "status": "PASS"
-    },
-    {
-      "category": "phi",
-      "test": "phi_after in [0,10]",
-      "status": "PASS"
-    },
-    {
-      "category": "phi",
-      "test": "confidence in [0,1]",
-      "status": "PASS"
-    },
-    {
-      "category": "optimizer",
-      "test": "Produces heuristics",
-      "status": "PASS"
-    },
-    {
-      "category": "replay",
-      "test": "Store works",
-      "status": "PASS"
-    },
-    {
-      "category": "replay",
-      "test": "Retrieve works",
-      "status": "PASS"
-    },
-    {
-      "category": "replay",
-      "test": "Clear works",
-      "status": "PASS"
-    },
-    {
-      "category": "backend",
-      "test": "Strip <think> basic",
-      "status": "PASS"
-    },
-    {
-      "category": "backend",
-      "test": "Strip <think> multiline",
-      "status": "PASS"
-    },
-    {
-      "category": "backend",
-      "test": "Strip unclosed <think>",
-      "status": "PASS"
-    },
-    {
-      "category": "backend",
-      "test": "No tags passthrough",
-      "status": "PASS"
-    },
-    {
-      "category": "routing",
-      "test": "ollama: prefix",
-      "status": "PASS"
-    },
-    {
-      "category": "routing",
-      "test": "auto-detect ollama model",
-      "status": "PASS"
-    },
-    {
-      "category": "tools",
-      "test": "Calculator safe: 2+3*4=14",
-      "status": "PASS"
-    },
-    {
-      "category": "tools",
-      "test": "Calculator safe: sqrt(16)=4.0",
-      "status": "PASS"
-    },
-    {
-      "category": "tools",
-      "test": "Calculator blocks __import__",
-      "status": "PASS"
-    },
-    {
-      "category": "tools",
-      "test": "ReadFile blocks /etc/passwd",
-      "status": "PASS"
-    },
-    {
-      "category": "tools",
-      "test": "WriteFile blocks /tmp/evil",
-      "status": "PASS"
-    },
-    {
-      "category": "runmode",
-      "test": "TRAIN allows write",
-      "status": "PASS"
-    },
-    {
-      "category": "runmode",
-      "test": "EVAL blocks write",
-      "status": "PASS"
-    },
-    {
-      "category": "runmode",
-      "test": "EVAL is_eval",
-      "status": "PASS"
-    },
-    {
-      "category": "trace",
-      "test": "Events recorded",
-      "status": "PASS"
-    },
-    {
-      "category": "trace",
-      "test": "JSONL roundtrip",
-      "status": "PASS"
-    },
-    {
-      "category": "memory",
-      "test": "7 MemoryKinds",
-      "status": "PASS"
-    },
-    {
-      "category": "memory",
-      "test": "5 MemoryStatuses",
-      "status": "PASS"
-    },
-    {
-      "category": "memory",
-      "test": "Scoped retrieve",
-      "status": "PASS"
-    },
-    {
-      "category": "compiler",
-      "test": "Respects token budget",
-      "status": "PASS"
-    },
-    {
-      "category": "compiler",
-      "test": "Returns memory IDs",
-      "status": "PASS"
-    },
-    {
-      "category": "immune",
-      "test": "Safe passes",
-      "status": "PASS"
-    },
-    {
-      "category": "immune",
-      "test": "Injection blocked",
-      "status": "PASS"
-    },
-    {
-      "category": "immune",
-      "test": "Score hack blocked",
-      "status": "PASS"
-    },
-    {
-      "category": "immune",
-      "test": "API key blocked",
-      "status": "PASS"
-    },
-    {
-      "category": "immune",
-      "test": "Tool misuse blocked",
-      "status": "PASS"
-    },
-    {
-      "category": "ci",
-      "test": "Good \u2192 quarantined",
-      "status": "PASS"
-    },
-    {
-      "category": "ci",
-      "test": "Promote works",
-      "status": "PASS"
-    },
-    {
-      "category": "ci",
-      "test": "Injection \u2192 rejected",
-      "status": "PASS"
-    },
-    {
-      "category": "agent",
-      "test": "Agent.run() completes",
-      "status": "PASS"
-    },
-    {
-      "category": "graph",
-      "test": "Conditional routing",
-      "status": "PASS"
-    },
-    {
-      "category": "parallel",
-      "test": "3 tasks complete",
-      "status": "PASS"
-    },
-    {
-      "category": "conversation",
-      "test": "Messages produced",
-      "status": "PASS"
-    },
-    {
-      "category": "knowledge",
-      "test": "Chunks stored",
-      "status": "PASS"
-    },
-    {
-      "category": "knowledge",
-      "test": "Query returns results",
-      "status": "PASS"
-    },
-    {
-      "category": "knowledge",
-      "test": "as_tool() works",
-      "status": "PASS"
-    },
-    {
-      "category": "easy",
-      "test": "purpose() auto-detects coding team",
-      "status": "PASS"
-    },
-    {
-      "category": "easy",
-      "test": "purpose() auto-detects research team",
-      "status": "PASS"
-    },
-    {
-      "category": "easy",
-      "test": "Team.build() works",
-      "status": "PASS"
-    },
-    {
-      "category": "research",
-      "test": "MetaRewardingLoop importable",
-      "status": "PASS"
-    },
-    {
-      "category": "research",
-      "test": "SelfTaughtEvaluator importable",
-      "status": "PASS"
-    },
-    {
-      "category": "research",
-      "test": "PromptOptimizer importable",
-      "status": "PASS"
-    },
-    {
-      "category": "research",
-      "test": "LLMCompiler importable",
-      "status": "PASS"
-    },
-    {
-      "category": "research",
-      "test": "Retroformer importable",
-      "status": "PASS"
-    },
-    {
-      "category": "research",
-      "test": "PromptOptimizer.compile_prompt works",
-      "status": "PASS"
-    },
-    {
-      "category": "research",
-      "test": "LLMCompiler plans tasks",
-      "status": "PASS"
-    },
-    {
-      "category": "research",
-      "test": "LLMCompiler executes plan",
-      "status": "PASS"
-    },
-    {
-      "category": "B2-MoH",
-      "test": "Shared identified",
-      "status": "PASS"
-    },
-    {
-      "category": "B2-MoH",
-      "test": "Total K=5 selected",
-      "status": "PASS"
-    },
-    {
-      "category": "B6-adversarial",
-      "test": "Catch rate 95%",
-      "status": "PASS"
-    },
-    {
-      "category": "B6-adversarial",
-      "test": "FP rate 0%",
-      "status": "PASS"
-    },
-    {
-      "category": "parser",
-      "test": "TOML actor parse",
-      "status": "PASS"
-    },
-    {
-      "category": "parser",
-      "test": "JSON actor parse",
-      "status": "PASS"
-    },
-    {
-      "category": "parser",
-      "test": "TOML critic parse",
-      "status": "PASS"
-    },
-    {
-      "category": "parser",
-      "test": "Extract code from markdown",
-      "status": "PASS"
-    },
-    {
-      "category": "benchmark",
-      "test": "Improvement curve: [1.0, 10.0, 10.0]",
-      "status": "PASS"
-    },
-    {
-      "category": "benchmark",
-      "test": "Heuristics learned: 6",
-      "status": "PASS"
-    }
-  ]
-}