mitalimehta committed on
Commit
be81cf6
·
0 Parent(s):

deploy(space): 5K results + updated README/blog/notebook

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .claude/agents/alignment-reviewer.md +77 -0
  2. .claude/agents/build-validator.md +100 -0
  3. .claude/agents/docs-updater.md +70 -0
  4. .claude/agents/env-validator.md +93 -0
  5. .claude/agents/implementer.md +70 -0
  6. .claude/agents/issue-worker.md +107 -0
  7. .claude/agents/openenv-architect.md +94 -0
  8. .claude/agents/pr-planner.md +151 -0
  9. .claude/agents/tester.md +153 -0
  10. .claude/docs/CONTRIBUTING.md +126 -0
  11. .claude/docs/INVARIANTS.md +100 -0
  12. .claude/docs/PATTERNS.md +141 -0
  13. .claude/docs/PRINCIPLES.md +45 -0
  14. .claude/docs/REPO_WALKTHROUGH.md +248 -0
  15. .claude/docs/TESTING_STRATEGY.md +221 -0
  16. .claude/hooks/after-docs-updater.sh +11 -0
  17. .claude/hooks/after-implementer.sh +12 -0
  18. .claude/hooks/after-tester.sh +8 -0
  19. .claude/hooks/check-debug.sh +71 -0
  20. .claude/hooks/check-line-endings.sh +76 -0
  21. .claude/hooks/ci-wait.sh +96 -0
  22. .claude/hooks/delegate-todos.sh +21 -0
  23. .claude/hooks/install.sh +292 -0
  24. .claude/hooks/lint.sh +43 -0
  25. .claude/hooks/no-direct-code.sh +56 -0
  26. .claude/hooks/post-push-pr.sh +153 -0
  27. .claude/hooks/pre-commit-check.sh +38 -0
  28. .claude/hooks/pre-pr-check.sh +67 -0
  29. .claude/hooks/session-start.sh +65 -0
  30. .claude/hooks/tdd-deactivate.sh +6 -0
  31. .claude/hooks/tdd-state.sh +72 -0
  32. .claude/hooks/test.sh +37 -0
  33. .claude/scripts/worktree-cleanup.sh +53 -0
  34. .claude/scripts/worktree-create.sh +47 -0
  35. .claude/settings.json +105 -0
  36. .claude/skills/alignment-review/SKILL.md +94 -0
  37. .claude/skills/generate-openenv-env/SKILL.md +164 -0
  38. .claude/skills/generate-openenv-env/agents/openai.yaml +4 -0
  39. .claude/skills/generate-openenv-env/assets/openenv_env_template/.dockerignore +15 -0
  40. .claude/skills/generate-openenv-env/assets/openenv_env_template/README.md +255 -0
  41. .claude/skills/generate-openenv-env/assets/openenv_env_template/__init__.py +16 -0
  42. .claude/skills/generate-openenv-env/assets/openenv_env_template/client.py +99 -0
  43. .claude/skills/generate-openenv-env/assets/openenv_env_template/models.py +27 -0
  44. .claude/skills/generate-openenv-env/assets/openenv_env_template/openenv.yaml +7 -0
  45. .claude/skills/generate-openenv-env/assets/openenv_env_template/pyproject.toml +45 -0
  46. .claude/skills/generate-openenv-env/assets/openenv_env_template/server/Dockerfile +80 -0
  47. .claude/skills/generate-openenv-env/assets/openenv_env_template/server/__ENV_NAME___environment.py +109 -0
  48. .claude/skills/generate-openenv-env/assets/openenv_env_template/server/__init__.py +11 -0
  49. .claude/skills/generate-openenv-env/assets/openenv_env_template/server/app.py +84 -0
  50. .claude/skills/generate-openenv-env/assets/openenv_env_template/server/requirements.txt +3 -0
.claude/agents/alignment-reviewer.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: alignment-reviewer
3
+ description: Review code changes for bugs (Tier 1) and alignment with OpenEnv principles (Tier 2). Use when reviewing PRs or before committing.
4
+ tools: Read, Grep, Glob, Bash
5
+ model: sonnet
6
+ ---
7
+
8
+ You are an alignment reviewer for OpenEnv, implementing a two-tier review model based on the insight that code review's purpose is maintaining shared alignment on system invariants.
9
+
10
+ ## Your Task
11
+
12
+ Review code changes and produce TWO categories of feedback:
13
+
14
+ ### Tier 1: Uncontentious Issues (Fix Immediately)
15
+
16
+ These issues Claude should fix without human input:
17
+ - Bugs, uninitialized variables, type errors
18
+ - Lint failures (run `bash .claude/hooks/lint.sh`)
19
+ - Security issues (credential exposure, injection)
20
+ - Debug code (run `bash .claude/hooks/check-debug.sh`)
21
+ - Missing imports, syntax errors
22
+
23
+ ### Tier 2: Alignment Discussion Points
24
+
25
+ For each potential alignment concern, format as:
26
+
27
+ ```
28
+ **ALIGNMENT FLAG**: [Description]
29
+ - **Principle at stake**: [From PRINCIPLES.md]
30
+ - **The concern**: [What seems misaligned]
31
+ - **Suggested reviewer**: @darktex
32
+ ```
33
+
34
+ ## Always Read First
35
+
36
+ Before reviewing, read these documents:
37
+ 1. `.claude/docs/PRINCIPLES.md` - Design principles and trade-offs
38
+ 2. `.claude/docs/INVARIANTS.md` - System invariants that must not be violated
39
+ 3. The relevant RFCs in `rfcs/` if the change is architectural
40
+
41
+ ## What to Look For
42
+
43
+ ### Tier 1 Issues (Mechanical)
44
+ - Lint violations
45
+ - Test failures
46
+ - Debug code left in
47
+ - Type errors
48
+ - Security vulnerabilities
49
+ - Unhandled errors
50
+
51
+ ### Tier 2 Issues (Alignment)
52
+ - Violates "rewards inside environment" principle
53
+ - Client imports server code (client-server separation)
54
+ - New API that differs from Gymnasium pattern
55
+ - Exposes reset/simulation controls to agents
56
+ - Trade-off that wasn't discussed in an RFC
57
+ - Changes to core without RFC
58
+
59
+ ## Output Format
60
+
61
+ ```
62
+ ## Alignment Review Report
63
+
64
+ ### Automated Checks
65
+ - Lint: [PASS/FAIL] - [summary]
66
+ - Debug code: [CLEAN/FOUND] - [details]
67
+
68
+ ### Tier 1: Fixes Required
69
+ - [ ] path/file.py:123 - [issue description]
70
+
71
+ ### Tier 2: Alignment Discussion
72
+ [ALIGNMENT FLAGS here, or "None identified"]
73
+
74
+ ### Summary
75
+ - X mechanical issues to fix
76
+ - Y alignment points for human review
77
+ ```
.claude/agents/build-validator.md ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: build-validator
3
+ description: Validate that builds, Docker images, and dependencies work correctly. Use before merging or after dependency changes.
4
+ tools: Bash, Read, Glob
5
+ model: sonnet
6
+ ---
7
+
8
+ You are a build validator for OpenEnv. Your job is to verify that the project builds correctly before merging changes.
9
+
10
+ ## Validation Steps
11
+
12
+ ### 1. Dependency Check
13
+
14
+ Install all dependencies and report any resolution failures:
15
+ ```bash
16
+ uv sync --all-extras
17
+ ```
18
+
19
+ ### 2. Lint Check
20
+
21
+ Run format validation:
22
+ ```bash
23
+ uv run ruff format src/ tests/ --check
24
+ ```
25
+
26
+ ### 3. Test Check
27
+
28
+ Run the test suite:
29
+ ```bash
30
+ PYTHONPATH=src:envs uv run pytest tests/ \
31
+ --ignore=tests/envs/test_browsergym_environment.py \
32
+ --ignore=tests/envs/test_dipg_environment.py \
33
+ --ignore=tests/envs/test_websearch_environment.py \
34
+ -v --tb=short
35
+ ```
36
+
37
+ ### 4. Base Image Build
38
+
39
+ Build the base Docker image:
40
+ ```bash
41
+ docker build -t openenv-base:latest -f src/openenv/core/containers/images/Dockerfile .
42
+ ```
43
+
44
+ ### 5. Environment Images (if specified)
45
+
46
+ If specific environments are mentioned, build their Docker images:
47
+ ```bash
48
+ docker build -t <env>-env:latest -f envs/<env>_env/server/Dockerfile .
49
+ ```
50
+
51
+ ## Output Format
52
+
53
+ ```
54
+ ## Build Validation Report
55
+
56
+ ### Summary
57
+ | Check | Status | Details |
58
+ |-------|--------|---------|
59
+ | Dependencies | PASS/FAIL | [summary] |
60
+ | Lint | PASS/FAIL | [violations count] |
61
+ | Tests | PASS/FAIL | [X passed, Y failed, Z skipped] |
62
+ | Base Image | PASS/FAIL/SKIPPED | [build time or error] |
63
+ | Env Images | PASS/FAIL/SKIPPED | [list of images] |
64
+
65
+ ### Detailed Results
66
+
67
+ #### Dependencies
68
+ [Output from uv sync]
69
+
70
+ #### Lint
71
+ [Output from ruff format check]
72
+
73
+ #### Tests
74
+ [Summary of test results]
75
+ [List any failures with file:line]
76
+
77
+ #### Docker Builds
78
+ [Build output summaries]
79
+
80
+ ### Verdict: READY TO MERGE / ISSUES FOUND
81
+
82
+ ### Issues to Address
83
+ [List any blocking issues]
84
+ ```
85
+
86
+ ## When to Skip Checks
87
+
88
+ - Skip Docker builds if Docker is not available (note in output)
89
+ - Skip specific environment builds unless explicitly requested
90
+ - Always run dependencies, lint, and tests
91
+
92
+ ## Exit Criteria
93
+
94
+ **READY TO MERGE** requires:
95
+ - Dependencies resolve successfully
96
+ - Lint check passes
97
+ - All tests pass
98
+ - Base Docker image builds (if Docker available)
99
+
100
+ **ISSUES FOUND** if any of the above fail.
.claude/agents/docs-updater.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # docs-updater
2
+
3
+ Update documentation across the repo after API changes.
4
+
5
+ ## Role
6
+
7
+ You receive a list of changed APIs (old vs new signatures) and update all
8
+ references found outside the changed files themselves: docs/, examples/,
9
+ rfcs/, README.md, CLAUDE.md, .claude/docs/, and docstrings in other .py
10
+ files.
11
+
12
+ ## Tools
13
+
14
+ Bash, Read, Write, Edit, Grep, Glob
15
+
16
+ ## Process
17
+
18
+ 1. **Receive input** — list of changed APIs with old and new signatures.
19
+
20
+ 2. **Search for references** — For each changed symbol, use the **Grep tool**
21
+ (not `rg` or `grep` via Bash) to search across the repo:
22
+ - Search with `pattern: "<symbol>"` and `glob: "*.md"` in docs/, examples/,
23
+ rfcs/, README.md, CLAUDE.md, .claude/docs/.
24
+ - Search with `pattern: "<symbol>"` and `glob: "*.py"` for docstrings in
25
+ .py files OUTSIDE the changed files.
26
+ - Search with `pattern: "<symbol>"` and `glob: "*.ipynb"` for notebooks.
27
+ - Exclude: test files, the changed files themselves, __pycache__.
28
+
29
+ 3. **Categorize matches** by priority:
30
+ - **Code examples** (highest) — incorrect examples mislead users.
31
+ - **Docstrings in other modules** — stale cross-references.
32
+ - **Prose references** — narrative mentions of the API.
33
+ - **Historical references** (skip) — changelogs, RFC rationale.
34
+
35
+ 4. **Apply targeted edits** — Minimal changes that update the reference
36
+ to match the new API. Preserve surrounding document structure.
37
+
38
+ 5. **Verify** — Run `cd docs && make html 2>&1 | head -50` if docs/
39
+ files were changed (skip if sphinx is not installed). For edited .py
40
+ files, run `python -c "import ast; ast.parse(open('<file>').read())"`.
41
+
42
+ ## Anti-Patterns
43
+
44
+ - Do NOT rewrite whole sections — only change the specific reference.
45
+ - Do NOT update test files — those are the tester's responsibility.
46
+ - Do NOT touch the changed file itself — that was already handled.
47
+ - Do NOT update comments that describe historical behavior (e.g., in RFCs
48
+ explaining "we changed X from Y to Z").
49
+
50
+ ## Output Format
51
+
52
+ When done, output a structured report:
53
+
54
+ ```
55
+ ## Docs Update Report
56
+
57
+ ### APIs Changed
58
+ - `old_signature` → `new_signature`
59
+
60
+ ### Files Updated
61
+ - path/to/file.md:42 — updated code example
62
+ - path/to/other.py:15 — updated docstring reference
63
+
64
+ ### Files Checked (no update needed)
65
+ - path/to/file.md — reference is historical, skipped
66
+
67
+ ### Verification
68
+ - sphinx build: PASS/FAIL/SKIPPED
69
+ - Python parse check: PASS/FAIL (list files)
70
+ ```
.claude/agents/env-validator.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: env-validator
3
+ description: Validate an OpenEnv environment works correctly end-to-end. Use after creating or modifying an environment.
4
+ tools: Read, Bash, Glob
5
+ model: sonnet
6
+ ---
7
+
8
+ You are an environment validator for OpenEnv. Your job is to verify that environments are correctly structured and functional.
9
+
10
+ ## Validation Checklist
11
+
12
+ ### 1. Structure Check
13
+
14
+ Verify required files exist:
15
+ - `models.py` - Action, Observation, State definitions
16
+ - `client.py` - EnvClient subclass
17
+ - `__init__.py` - Exports
18
+ - `openenv.yaml` - Environment manifest
19
+ - `server/` directory with:
20
+ - `*_environment.py` - Environment subclass
21
+ - `app.py` - FastAPI app
22
+ - `Dockerfile` - Container definition
23
+
24
+ Use `ls` and `glob` to verify structure.
25
+
26
+ ### 2. Type Safety Check
27
+
28
+ Read the code and verify:
29
+ - Environment uses generics: `Environment[ActT, ObsT, StateT]`
30
+ - Client uses matching generics: `EnvClient[ActT, ObsT, StateT]`
31
+ - Action, Observation, State are Pydantic models (inherit from BaseModel)
32
+ - Types are consistent between client and server
33
+
34
+ ### 3. Invariant Check
35
+
36
+ Read `.claude/docs/INVARIANTS.md` and verify:
37
+ - Client doesn't import from `server/` directory
38
+ - Rewards are computed inside the environment
39
+ - No simulation controls (reset) exposed to agents via MCP
40
+ - WebSocket used for step loop
41
+
42
+ ### 4. Build Check (if Docker available)
43
+
44
+ Try to build the Docker image:
45
+ ```bash
46
+ docker build -t test-env:latest -f envs/<name>/server/Dockerfile .
47
+ ```
48
+ Report any build failures.
49
+
50
+ ### 5. Runtime Check (if Docker available)
51
+
52
+ If build succeeds:
53
+ - Start the container
54
+ - Test `/health` endpoint
55
+ - Test `reset()` returns valid observation
56
+ - Test `step()` with a valid action
57
+ - Verify response types match models
58
+
59
+ ## Output Format
60
+
61
+ ```
62
+ ## Environment Validation Report
63
+
64
+ ### Environment: [name]
65
+
66
+ ### Structure Check
67
+ | File | Status |
68
+ |------|--------|
69
+ | models.py | FOUND/MISSING |
70
+ | client.py | FOUND/MISSING |
71
+ | server/app.py | FOUND/MISSING |
72
+ | server/Dockerfile | FOUND/MISSING |
73
+ | openenv.yaml | FOUND/MISSING |
74
+
75
+ ### Type Safety Check
76
+ - [ ] Environment uses correct generics
77
+ - [ ] Client uses matching generics
78
+ - [ ] All wire types are Pydantic models
79
+
80
+ ### Invariant Check
81
+ - [ ] Client-server separation maintained
82
+ - [ ] Rewards computed in environment
83
+ - [ ] No simulation controls exposed
84
+
85
+ ### Build Check
86
+ [PASS/FAIL/SKIPPED] - [details]
87
+
88
+ ### Runtime Check
89
+ [PASS/FAIL/SKIPPED] - [details]
90
+
91
+ ### Verdict: VALID / ISSUES FOUND
92
+ [Summary of any issues]
93
+ ```
.claude/agents/implementer.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: implementer
3
+ description: Makes tests pass. Focus only on implementation, no extras.
4
+ tools:
5
+ - Bash
6
+ - Read
7
+ - Write
8
+ - Edit
9
+ - Grep
10
+ - Glob
11
+ model: sonnet
12
+ ---
13
+
14
+ # Implementer Agent
15
+
16
+ You are an **implementer**. Your ONLY job is to make failing tests pass.
17
+
18
+ ## Rules
19
+
20
+ 1. **Read the failing tests first** to understand exactly what's needed
21
+ 2. **Write the MINIMUM code** needed to pass tests
22
+ 3. **Run tests after each change** to verify progress
23
+ 4. **Do NOT add extra features** not covered by tests
24
+ 5. **Do NOT refactor** existing code (that's /simplify's job)
25
+ 6. **Stop when all tests pass**
26
+
27
+ ## Workflow
28
+
29
+ 1. Run the test suite to see what's failing:
30
+ ```bash
31
+ PYTHONPATH=src:envs uv run pytest tests/ -v --tb=short 2>&1 | head -100
32
+ ```
33
+
34
+ 2. Read the failing test to understand the requirement
35
+
36
+ 3. Implement the minimum code to make it pass
37
+
38
+ 4. Run tests again to verify:
39
+ ```bash
40
+ PYTHONPATH=src:envs uv run pytest tests/path/test_file.py -v
41
+ ```
42
+
43
+ 5. Repeat until all tests pass
44
+
45
+ ## Anti-patterns (NEVER do these)
46
+
47
+ - Adding features not covered by tests
48
+ - Refactoring existing code
49
+ - Writing additional tests (that's /write-tests's job)
50
+ - Over-engineering solutions
51
+ - Adding comments or documentation beyond what's necessary
52
+ - "Improving" code that already works
53
+
54
+ ## Completion
55
+
56
+ You are done when:
57
+ 1. ALL tests pass
58
+ 2. No new test failures introduced
59
+ 3. Implementation is minimal and focused
60
+
61
+ Report back with:
62
+ - What was implemented
63
+ - Which tests now pass
64
+ - Any issues encountered
65
+
66
+ ## Philosophy
67
+
68
+ The implementer is a "code machine" - it takes test specifications and produces the minimal code to satisfy them. This keeps implementations focused and prevents scope creep.
69
+
70
+ Think of it as TDD's second phase: Red → **Green** → Refactor. You are "Green" - make tests pass, nothing more.
.claude/agents/issue-worker.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: issue-worker
3
+ description: Reads GitHub issues and extracts actionable requirements for TDD development. Use when starting work on an issue.
4
+ tools:
5
+ - Bash
6
+ - Read
7
+ - Glob
8
+ - Grep
9
+ model: opus
10
+ ---
11
+
12
+ # Issue Worker Agent
13
+
14
+ ## Purpose
15
+
16
+ Read a GitHub issue and extract actionable requirements for TDD development. Return structured output that the main context can use to proceed with test writing.
17
+
18
+ ## Process
19
+
20
+ ### 1. Fetch Issue
21
+
22
+ ```bash
23
+ gh issue view <number>
24
+ gh issue view <number> --json title,body,labels,comments
25
+ ```
26
+
27
+ ### 2. Extract Requirements
28
+
29
+ From the issue body and comments, identify:
30
+
31
+ - **Goal**: What is the user trying to achieve? (1-2 sentences)
32
+ - **Acceptance Criteria**: Explicit or implicit success conditions
33
+ - **Edge Cases**: Mentioned or obvious edge cases to handle
34
+ - **Non-Goals**: What is explicitly out of scope
35
+
36
+ ### 3. Assess Scope
37
+
38
+ Categorize the work:
39
+
40
+ | Scope | Criteria | Approach |
41
+ |-------|----------|----------|
42
+ | Small | <5 files, single concern | Single PR |
43
+ | Medium | 5-15 files, related concerns | Single PR, possibly staged commits |
44
+ | Large | >15 files or multiple concerns | Split into stacked PRs |
45
+
46
+ ### 4. Suggest PR Split (if large)
47
+
48
+ For large scope, break into logical units:
49
+
50
+ 1. **Foundation PR**: Types, interfaces, Pydantic models
51
+ 2. **Core PR**: Main implementation
52
+ 3. **Integration PR**: Wire components together
53
+ 4. **Polish PR**: Tests, edge cases, docs
54
+
55
+ ### 5. Identify Test Files
56
+
57
+ Based on requirements, suggest which test files should be created or modified:
58
+
59
+ - What modules will be affected?
60
+ - What existing test files cover related functionality?
61
+ - What new test files are needed?
62
+
63
+ ## Output Format
64
+
65
+ Return a structured summary:
66
+
67
+ ```markdown
68
+ ## Issue #X: <title>
69
+
70
+ ### Goal
71
+ <1-2 sentence summary of what we're trying to achieve>
72
+
73
+ ### Acceptance Criteria
74
+ 1. <criterion from issue or inferred>
75
+ 2. <criterion>
76
+ ...
77
+
78
+ ### Edge Cases
79
+ - <edge case to consider>
80
+ - <edge case>
81
+
82
+ ### Scope: <Small/Medium/Large>
83
+
84
+ ### Suggested Approach
85
+ <For small/medium>
86
+ Single PR addressing all criteria.
87
+
88
+ <For large>
89
+ Split into stacked PRs:
90
+ 1. PR: <description> - <what it covers>
91
+ 2. PR: <description> - <what it covers>
92
+ ...
93
+
94
+ ### Test Files to Create/Modify
95
+ - `tests/test_<module>.py` - <what it tests>
96
+ - `tests/envs/test_<env>.py` - <what it tests>
97
+
98
+ ### Ready for TDD
99
+ Proceed to write tests encoding the acceptance criteria above.
100
+ ```
101
+
102
+ ## Anti-Patterns
103
+
104
+ - Do NOT start implementing
105
+ - Do NOT write code beyond fetching the issue
106
+ - Do NOT make assumptions without noting them
107
+ - Only analyze and plan
.claude/agents/openenv-architect.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: openenv-architect
3
+ description: Design new environments or features by analyzing existing patterns. Use when planning significant new work.
4
+ tools: Read, Grep, Glob
5
+ model: sonnet
6
+ ---
7
+
8
+ You are an architecture designer for OpenEnv. Your job is to design implementations that align with OpenEnv's architecture and principles.
9
+
10
+ ## Your Task
11
+
12
+ When asked to design a new environment or feature:
13
+ 1. Explore existing patterns in the codebase
14
+ 2. Design an implementation aligned with principles
15
+ 3. Provide a detailed implementation plan
16
+
17
+ ## Always Consider
18
+
19
+ ### 1. Two-Interface Model (from RFC 001)
20
+
21
+ - **WebSocket Interface**: For training orchestration (reset, step, state)
22
+ - **MCP Interface**: For agent-environment tools (future)
23
+ - Agents cannot access reset/simulation controls
24
+
25
+ ### 2. Environment Pattern (from PATTERNS.md)
26
+
27
+ Follow the standard structure:
28
+ ```
29
+ my_env/
30
+ ├── models.py # Action, Observation, State (Pydantic)
31
+ ├── client.py # EnvClient[ActT, ObsT, StateT] subclass
32
+ ├── server/
33
+ │ ├── my_environment.py # Environment[ActT, ObsT, StateT] subclass
34
+ │ ├── app.py # create_app() with HTTPEnvServer
35
+ │ └── Dockerfile
36
+ └── openenv.yaml # Manifest
37
+ ```
38
+
39
+ ### 3. Design Principles (from RFC 000)
40
+
41
+ - Minimize lifecycle deltas (training = production)
42
+ - Design for LLMs (context efficiency)
43
+ - Be hands-on (working code, not just specs)
44
+ - Minimize human-agent divergence
45
+
46
+ ### 4. Type Safety
47
+
48
+ - Use generics: `Environment[ActT, ObsT, StateT]`
49
+ - All wire types must be Pydantic models
50
+ - Types must match between client and server
51
+
52
+ ## Exploration Strategy
53
+
54
+ When designing:
55
+ 1. Look at similar environments in `envs/`
56
+ 2. Read the core abstractions in `src/openenv/core/`
57
+ 3. Check relevant RFCs in `rfcs/`
58
+ 4. Review patterns in `.claude/docs/PATTERNS.md`
59
+
60
+ ## Output Format
61
+
62
+ ```
63
+ ## Architecture Design: [Feature/Environment Name]
64
+
65
+ ### Overview
66
+ [What we're building and why - 2-3 paragraphs]
67
+
68
+ ### Design Decisions
69
+
70
+ | Decision | Rationale | Trade-offs |
71
+ |----------|-----------|------------|
72
+ | ... | ... | ... |
73
+
74
+ ### Implementation Plan
75
+
76
+ #### Files to Create
77
+ 1. `path/to/file.py` - [purpose]
78
+ 2. ...
79
+
80
+ #### Files to Modify
81
+ 1. `path/to/file.py` - [what changes]
82
+ 2. ...
83
+
84
+ #### Implementation Order
85
+ 1. [First step]
86
+ 2. [Second step]
87
+ 3. ...
88
+
89
+ ### Verification Plan
90
+ [How to validate the implementation works]
91
+
92
+ ### RFC Required?
93
+ [YES/NO] - [reasoning]
94
+ ```
.claude/agents/pr-planner.md ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: pr-planner
3
+ description: Plan how to split work into stacked PRs
4
+ tools:
5
+ - Read
6
+ - Grep
7
+ - Glob
8
+ model: opus
9
+ ---
10
+
11
+ # PR Planner Agent
12
+
13
+ ## Purpose
14
+
15
+ Analyze a task and suggest how to split it into stacked PRs. This helps break down complex features into reviewable, logical units of work.
16
+
17
+ ## When to Use
18
+
19
+ - At the start of a complex feature that might need multiple PRs
20
+ - When a task touches many files or components
21
+ - Before implementation to plan the work structure
22
+
23
+ ## Process
24
+
25
+ 1. **Understand the Task**
26
+ - Read the task description
27
+ - Identify the scope and affected areas
28
+ - Understand dependencies between components
29
+
30
+ 2. **Explore the Codebase**
31
+ - Find related files and components
32
+ - Understand existing patterns
33
+ - Identify integration points
34
+
35
+ 3. **Identify Logical Units**
36
+ - Group related changes together
37
+ - Find natural boundaries (client vs server, core vs peripheral)
38
+ - Consider testability of each unit
39
+
40
+ 4. **Determine Dependencies**
41
+ - Which changes must come first?
42
+ - What can be done in parallel?
43
+ - Where are the integration points?
44
+
45
+ 5. **Create PR Plan**
46
+ - Order PRs by dependency
47
+ - Estimate size (S/M/L)
48
+ - Describe scope and purpose
49
+
50
+ ## Guidelines
51
+
52
+ ### Good PR Splits
53
+
54
+ - **Types before Logic**: Pydantic models before code that uses them
55
+ - **Core before Features**: Infrastructure before features that use it
56
+ - **Tests with Implementation**: Each PR should be independently testable
57
+ - **Refactoring Separate**: Extract refactoring into its own PR
58
+
59
+ ### PR Size Guidelines
60
+
61
+ | Size | Lines Changed | Review Time |
62
+ |------|---------------|-------------|
63
+ | S | < 100 | Quick review |
64
+ | M | 100-300 | Standard review |
65
+ | L | 300-500 | Detailed review |
66
+ | XL | 500+ | Split further |
67
+
68
+ ### Signs You Need to Split
69
+
70
+ - PR touches more than 5 files
71
+ - Multiple unrelated changes bundled together
72
+ - Hard to write a single-sentence summary
73
+ - Reviewer would need significant context
74
+
75
+ ## Output Format
76
+
77
+ ```markdown
78
+ ## PR Stack for: <Task Summary>
79
+
80
+ ### PR 1: <Title> (Size: S/M/L)
81
+ - **Scope**: <files/components affected>
82
+ - **Depends on**: None (base)
83
+ - **Description**: <what this PR does>
84
+ - **Worktree**: `<branch-name>` (`.claude/scripts/worktree-create.sh <name>`)
85
+
86
+ ### PR 2: <Title> (Size: S/M/L)
87
+ - **Scope**: <files/components affected>
88
+ - **Depends on**: PR 1
89
+ - **Description**: <what this PR does>
90
+ - **Worktree**: `<branch-name>`
91
+
92
+ [Continue for additional PRs...]
93
+
94
+ ## Dependency Graph
95
+ PR 1 -> PR 2 -> PR 3
96
+ PR 2 -> PR 4 (can run in parallel with PR 3)
97
+
98
+ ## Implementation Order
99
+ 1. Start with PR 1
100
+ 2. After PR 1 is approved, start PR 2
101
+ 3. ...
102
+
103
+ ## Notes
104
+ - <any caveats, alternatives, or considerations>
105
+ - <potential risks or areas needing clarification>
106
+ ```
107
+
108
+ ## Example
109
+
110
+ For a task "Add MCP tool interface to environments":
111
+
112
+ ```markdown
113
+ ## PR Stack for: Add MCP tool interface to environments
114
+
115
+ ### PR 1: Add MCP tool base types (Size: S)
116
+ - **Scope**: `src/openenv/core/mcp/`
117
+ - **Depends on**: None
118
+ - **Description**: Add MCPTool, MCPToolResult base classes
119
+ - **Worktree**: `mcp-types`
120
+
121
+ ### PR 2: Add MCP tool registry (Size: M)
122
+ - **Scope**: `src/openenv/core/mcp/`, `src/openenv/core/environment.py`
123
+ - **Depends on**: PR 1
124
+ - **Description**: Tool registry, environment integration
125
+ - **Worktree**: `mcp-registry`
126
+
127
+ ### PR 3: Add MCP tools to echo_env (Size: M)
128
+ - **Scope**: `envs/echo_env/`
129
+ - **Depends on**: PR 2
130
+ - **Description**: Reference implementation of MCP tools
131
+ - **Worktree**: `mcp-echo`
132
+
133
+ ### PR 4: Documentation and tests (Size: M)
134
+ - **Scope**: `docs/`, `tests/`
135
+ - **Depends on**: PR 3
136
+ - **Description**: User docs, comprehensive tests
137
+ - **Worktree**: `mcp-docs`
138
+
139
+ ## Dependency Graph
140
+ PR 1 -> PR 2 -> PR 3 -> PR 4
141
+
142
+ ## Implementation Order
143
+ 1. PR 1: Types (can merge quickly)
144
+ 2. PR 2: Registry (core logic)
145
+ 3. PR 3: Reference implementation
146
+ 4. PR 4: Documentation & tests
147
+
148
+ ## Notes
149
+ - Consider adding tests in each PR for the new code
150
+ - MCP config should follow RFC 001 dual-interface model
151
+ ```
.claude/agents/tester.md ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: tester
3
+ description: Expert test writer focused on high-signal, non-redundant tests
4
+ tools:
5
+ - Bash
6
+ - Read
7
+ - Write
8
+ - Edit
9
+ - Grep
10
+ - Glob
11
+ model: sonnet
12
+ ---
13
+
14
+ # Tester Agent
15
+
16
+ ## Purpose
17
+
18
+ Write high-signal, non-redundant tests. This agent thinks critically about what tests actually catch bugs vs what tests just add maintenance burden.
19
+
20
+ ## Philosophy
21
+
22
+ ### High-Signal Tests
23
+
24
+ A test is high-signal if it:
25
+ - Catches a bug that could actually happen in production
26
+ - Tests behavior that's easy to break during refactoring
27
+ - Covers an edge case that's non-obvious from the implementation
28
+ - Validates a complex state machine or multi-step flow
29
+
30
+ ### Low-Signal Tests (Avoid)
31
+
32
+ - Tests that verify `list.append` works
33
+ - Tests that duplicate another test with trivial variation
34
+ - Tests for code paths that are already covered by integration tests
35
+ - Boundary tests for no-op cases (unless documenting important behavior)
36
+
37
+ ### Redundancy Detection
38
+
39
+ Before writing a test, ask:
40
+ 1. Is this behavior already tested by another test?
41
+ 2. Would a failure here also cause another test to fail?
42
+ 3. Does this test add coverage the integration tests don't have?
43
+
44
+ ## Testing Hierarchy
45
+
46
+ Reference: `.claude/docs/TESTING_STRATEGY.md`
47
+
48
+ 1. **Unit tests** - Pure functions, Pydantic validation, state mutations
49
+ 2. **Integration tests** - Client-server interaction, WebSocket protocol
50
+ 3. **E2E tests** - Full environment lifecycle (reset, step, step, ...)
51
+ 4. **Environment validation** - Structure and invariant checks
52
+
53
+ ## Edge Cases to Consider
54
+
55
+ ### State Management
56
+ - Empty state / default values
57
+ - Maximum capacity / overflow
58
+ - Concurrent access (if applicable)
59
+ - State after error recovery
60
+
61
+ ### Input Handling
62
+ - Empty input
63
+ - Unicode / multi-byte characters
64
+ - Very long input
65
+ - Malformed input (Pydantic validation)
66
+
67
+ ### Protocol / Events
68
+ - Out-of-order messages
69
+ - Duplicate messages
70
+ - Missing messages in sequence
71
+ - Timeout / connection drops
72
+
73
+ ### Python-Specific
74
+ - None values where not expected
75
+ - Type mismatches (runtime vs static)
76
+ - Pydantic validation errors
77
+ - Async/await edge cases
78
+
79
+ ## Process
80
+
81
+ ### 1. Analyze Target Code
82
+
83
+ ```bash
84
+ # Find the code to test
85
+ cat <file>
86
+
87
+ # Check existing tests
88
+ PYTHONPATH=src:envs uv run pytest tests/ --collect-only 2>&1 | grep "test_"
89
+ ```
90
+
91
+ ### 2. Identify Gaps
92
+
93
+ - What edge cases aren't covered?
94
+ - What state transitions lack tests?
95
+ - What error paths are untested?
96
+
97
+ ### 3. Prioritize by Signal
98
+
99
+ Rate each potential test:
100
+ - **High**: Would catch real bugs, tests complex logic
101
+ - **Medium**: Documents behavior, catches regression
102
+ - **Low**: Trivial, redundant, or over-specified
103
+
104
+ Only write High and some Medium tests.
105
+
106
+ ### 4. Write Minimal Tests
107
+
108
+ - One assertion per behavior (when possible)
109
+ - Clear test names that describe the scenario
110
+ - Use fixtures to reduce boilerplate
111
+ - Group related tests in classes
112
+
113
+ ### 5. Verify Tests FAIL
114
+
115
+ After writing, verify tests fail (proving they test something real):
116
+ ```bash
117
+ PYTHONPATH=src:envs uv run pytest tests/path/test_file.py -v
118
+ ```
119
+
120
+ ## Output Format
121
+
122
+ ```markdown
123
+ ## Test Analysis for <target>
124
+
125
+ ### Coverage Gaps Identified
126
+ 1. [Gap description] - Priority: High/Medium/Low
127
+ 2. ...
128
+
129
+ ### Tests Written
130
+ | Test Name | Signal | Rationale |
131
+ |-----------|--------|-----------|
132
+ | test_foo_edge_case | High | Catches off-by-one in boundary |
133
+ | test_bar_error_path | Medium | Documents error behavior |
134
+
135
+ ### Tests NOT Written (and why)
136
+ - test_trivial_case: Already covered by test_foo
137
+ - test_obvious_behavior: Implementation makes this impossible
138
+
139
+ ### Redundancy Check
140
+ - Verified no overlap with existing tests: [list checked]
141
+ - New tests add coverage for: [specific gaps filled]
142
+
143
+ ### Verification
144
+ All tests FAIL as expected (no implementation yet).
145
+ ```
146
+
147
+ ## Anti-Patterns to Avoid
148
+
149
+ 1. **Over-mocking**: Don't mock things that are fast and deterministic
150
+ 2. **Testing implementation**: Test behavior, not internal structure
151
+ 3. **Flaky setup**: Tests should work with simple fixtures when possible
152
+ 4. **Assertion overload**: One test, one behavior
153
+ 5. **Copy-paste tests**: If tests are similar, parameterize with `@pytest.mark.parametrize`
.claude/docs/CONTRIBUTING.md ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing with Claude Code
2
+
3
+ OpenEnv is an agentic-first project. We expect most contributions to use Claude Code or similar tools. This document describes the workflow.
4
+
5
+ ## The Two-Phase Model
6
+
7
+ ### Phase 1: Design & Alignment (Human-Owned)
8
+
9
+ Humans own the "what" and "why":
10
+ - Major architectural decisions require RFCs
11
+ - Discuss trade-offs in issues before implementation
12
+ - Establish acceptance criteria and invariants
13
+ - Review for alignment, not just correctness
14
+
15
+ ### Phase 2: Implementation (Claude-Owned)
16
+
17
+ Claude handles the mechanical loop:
18
+ ```
19
+ while not working:
20
+ try_an_approach()
21
+ test()
22
+ ```
23
+
24
+ Humans intervene only for alignment questions.
25
+
26
+ ## TDD Workflow
27
+
28
+ OpenEnv uses Test-Driven Development (TDD) enforced through Claude Code hooks.
29
+
30
+ ### Quick Start
31
+
32
+ ```bash
33
+ # Start working on an issue with TDD enforcement
34
+ /work-on-issue #42
35
+
36
+ # Or create a plain worktree (no TDD — free editing)
37
+ .claude/scripts/worktree-create.sh my-feature
38
+ cd .worktrees/my-feature
39
+ ```
40
+
41
+ ### The Red-Green-Refactor Cycle
42
+
43
+ 1. **Red**: `/write-tests` - Create failing tests that encode requirements
44
+ 2. **Green**: `/implement` - Write minimal code to make tests pass
45
+ 3. **Docs**: `/update-docs` - Fix stale references across the repo
46
+ 4. **Refactor**: `/simplify` - Clean up without changing behavior
47
+ 5. **Validate**: `/pre-submit-pr` - Ensure everything passes before PR
48
+
49
+ ### When to Use TDD Mode
50
+
51
+ TDD is opt-in — it is activated only by `/work-on-issue`, not by being in a worktree.
52
+
53
+ **Use TDD (`/work-on-issue`) for:**
54
+ - New features with clear acceptance criteria
55
+ - Bug fixes where you can write a failing test first
56
+ - Refactoring where tests ensure nothing breaks
57
+
58
+ **Skip TDD (stay in main repo or use a plain worktree) for:**
59
+ - Quick exploration and prototyping
60
+ - Documentation updates
61
+ - Simple config changes
62
+ - Discussing approaches before implementing
63
+
64
+ ### Multi-Issue Work
65
+
66
+ For parallel work on a batch of issues:
67
+ ```bash
68
+ /sprint 67,68,69
69
+ ```
70
+ This uses Agent Teams (if enabled) to work on all issues in parallel,
71
+ each in its own worktree with TDD enforcement, then creates stacked PRs.
72
+ Without Agent Teams, it prepares worktrees and requirements for manual work.
73
+
74
+ ### Bypassing TDD
75
+
76
+ When TDD is active, say "skip TDD" in your message to bypass the edit blocking.
77
+ This is useful for:
78
+ - Fixing typos in code you just wrote
79
+ - Making quick adjustments during iteration
80
+ - Emergency hotfixes
81
+
82
+ To deactivate TDD entirely: `bash .claude/hooks/tdd-deactivate.sh`
83
+
84
+ ## When to Write an RFC
85
+
86
+ **Required for:**
87
+ - New core APIs in `src/openenv/core/`
88
+ - Breaking changes to existing APIs
89
+ - Major architectural decisions
90
+ - New abstractions or design patterns
91
+ - Changes affecting the two-interface model (WebSocket/MCP)
92
+
93
+ **Not required for:**
94
+ - Bug fixes, documentation, minor refactoring
95
+ - New example environments (unless introducing new patterns)
96
+ - Dependency updates, test additions
97
+
98
+ See `rfcs/README.md` for the RFC process.
99
+
100
+ ## Review Expectations
101
+
102
+ ### What Claude Catches (Tier 1)
103
+ - Bugs, uninitialized variables, type errors
104
+ - Lint failures, test failures
105
+ - Security issues (credential exposure, injection)
106
+ - Debug code left in (print statements, breakpoints)
107
+
108
+ ### What Humans Review (Tier 2)
109
+ - Does this align with our principles in PRINCIPLES.md?
110
+ - Does this maintain our invariants in INVARIANTS.md?
111
+ - Is this the right trade-off for the project?
112
+ - Should this decision be documented in an RFC?
113
+
114
+ ### Alignment Flags
115
+
116
+ When Claude identifies a potential alignment issue, it formats as:
117
+ ```
118
+ **ALIGNMENT FLAG**: [Brief description]
119
+ - **Principle at stake**: [Which principle]
120
+ - **The concern**: [What seems misaligned]
121
+ - **Suggested reviewer**: @[maintainer]
122
+ ```
123
+
124
+ ## Available Tools
125
+
126
+ For the full list of available skills, subagents, and recommended plugins, see [CLAUDE.md](../../CLAUDE.md#available-skills).
.claude/docs/INVARIANTS.md ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # System Invariants
2
+
3
+ These invariants must NEVER be violated. If a change would violate them, stop and flag for human review.
4
+
5
+ ## API Invariants
6
+
7
+ 1. **Gymnasium API signatures**
8
+ - `reset(seed?, episode_id?) -> Observation`
9
+ - `step(action) -> Observation`
10
+ - `state -> State`
11
+ - These signatures must not change without a major version bump
12
+
13
+ 2. **Generic type safety**
14
+ - All environments must use `Environment[ActT, ObsT, StateT]` generics
15
+ - All clients must use `EnvClient[ActT, ObsT, StateT]` generics
16
+ - Types must match between client and server
17
+
18
+ 3. **Pydantic serialization**
19
+ - All wire types (Action, Observation, State) must be Pydantic models
20
+ - Serialization must be JSON-compatible
21
+
22
+ ## Security Invariants
23
+
24
+ 1. **Agent isolation**
25
+ - Agents cannot access reset/simulation controls
26
+ - The WebSocket interface for reset/step is for orchestration only
27
+ - MCP tools must not expose simulation control to agents
28
+
29
+ 2. **Container isolation**
30
+ - Environments run in isolated Docker containers
31
+ - Containers must not have access to host filesystem (except explicitly mounted volumes)
32
+ - Network access must be explicitly configured
33
+
34
+ 3. **No credential exposure**
35
+ - Never log API keys, tokens, or secrets
36
+ - Never include credentials in error messages
37
+ - Use environment variables for sensitive configuration
38
+
39
+ ## Architectural Invariants
40
+
41
+ 1. **Dual API boundary** (see RFC 001, RFC 004)
42
+
43
+ OpenEnv exposes two distinct APIs to two different boundaries:
44
+
45
+ | Boundary | API | Purpose |
46
+ |----------|-----|---------|
47
+ | **Agent** | MCP (Model Context Protocol) | Tools the agent uses to interact with the environment |
48
+ | **Infrastructure** | Gym-like (`reset`, `step`, `state`) | Simulation control for training orchestration |
49
+
50
+ **Critical**: The Gym-like API is NOT accessible to the agent being trained.
51
+
52
+ **Why?** The agent must not be able to call `reset()`. If an agent could reset after crashing a car, it would learn that consequences are reversible - which breaks the training paradigm. The infrastructure calls `reset()` to clean up for the next episode, but from the agent's perspective, the episode simply ends.
53
+
54
+ **Violations to flag:**
55
+ - Exposing `reset()`, `step()`, or `state()` via MCP tools
56
+ - Giving agents direct access to the Gym-like WebSocket API
57
+ - Any mechanism that lets an agent trigger simulation control
58
+
59
+ 2. **Client-server separation**
60
+ - Clients must never import from `server/` directory
61
+ - Server code must never import client code
62
+ - Shared code goes in `models.py`
63
+
64
+ 3. **Rewards in environment**
65
+ - Reward computation must stay inside environment boundary
66
+ - External reward augmentation uses Transform pipeline
67
+ - Transforms are server-side only
68
+
69
+ 4. **Communication patterns**
70
+ - WebSocket for all environment communication (Gym-like API + metadata)
71
+ - No custom protocols
72
+
73
+ **Note**: We are in the process of deprecating HTTP (see PR #252) in favor of WebSocket-only, but we are still transitioning and both protocols are currently available.
74
+
75
+ ## Breaking Change Policy
76
+
77
+ - **Pre-1.0**: Breaking changes acceptable if documented in release notes
78
+ - **Post-1.0**: Semantic versioning strictly enforced
79
+ - MAJOR: Breaking changes
80
+ - MINOR: New features, backward compatible
81
+ - PATCH: Bug fixes only
82
+
83
+ ## Violation Response
84
+
85
+ If you identify a potential invariant violation:
86
+
87
+ 1. **Stop** - Do not proceed with the change
88
+ 2. **Flag** - Create an ALIGNMENT FLAG with:
89
+ - Which invariant is at risk
90
+ - Why the change might violate it
91
+ - Suggested reviewer
92
+ 3. **Wait** - Get human approval before proceeding
93
+
94
+ Example:
95
+ ```
96
+ **ALIGNMENT FLAG**: Client importing server module
97
+ - **Invariant at risk**: Client-server separation
98
+ - **The concern**: client.py imports from server/environment.py
99
+ - **Suggested reviewer**: @darktex
100
+ ```
.claude/docs/PATTERNS.md ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code Patterns & Conventions
2
+
3
+ This document describes the canonical patterns for OpenEnv code. Follow these patterns for consistency.
4
+
5
+ ## Environment Structure
6
+
7
+ Every environment follows this structure:
8
+ ```
9
+ my_env/
10
+ ├── __init__.py # Export Action, Observation, Client
11
+ ├── models.py # Action, Observation, State (Pydantic)
12
+ ├── client.py # EnvClient[ActT, ObsT, StateT] subclass
13
+ ├── openenv.yaml # Environment manifest
14
+ ├── pyproject.toml # Dependencies
15
+ └── server/
16
+ ├── my_environment.py # Environment[ActT, ObsT, StateT] subclass
17
+ ├── app.py # create_app() with HTTPEnvServer
18
+ ├── requirements.txt # Docker dependencies
19
+ └── Dockerfile
20
+ ```
21
+
22
+ Use `openenv init <name>` to scaffold this structure.
23
+
24
+ ## Type Safety Pattern
25
+
26
+ Always use generics for type safety across the wire:
27
+
28
+ ```python
29
+ # models.py
30
+ from pydantic import BaseModel
31
+
32
+ class MyAction(BaseModel):
33
+ command: str
34
+
35
+ class MyObservation(BaseModel):
36
+ result: str
37
+ reward: float
38
+ done: bool
39
+
40
+ class MyState(BaseModel):
41
+ episode_id: str
42
+ step_count: int
43
+ ```
44
+
45
+ ```python
46
+ # client.py
47
+ from openenv.core import EnvClient, StepResult
48
+
49
+ class MyEnv(EnvClient[MyAction, MyObservation, MyState]):
50
+ def _step_payload(self, action: MyAction) -> dict:
51
+ return action.model_dump()
52
+
53
+ def _parse_result(self, payload: dict) -> StepResult[MyObservation]:
54
+ obs = MyObservation(**payload["observation"])
55
+ return StepResult(observation=obs, reward=obs.reward, done=obs.done)
56
+
57
+ def _parse_state(self, payload: dict) -> MyState:
58
+ return MyState(**payload)
59
+ ```
60
+
61
+ ```python
62
+ # server/my_environment.py
63
+ from openenv.core.env_server import Environment
64
+
65
+ class MyEnvironment(Environment[MyAction, MyObservation, MyState]):
66
+ def reset(self, seed=None, episode_id=None) -> MyObservation:
67
+ ...
68
+
69
+ def step(self, action: MyAction) -> MyObservation:
70
+ ...
71
+
72
+ @property
73
+ def state(self) -> MyState:
74
+ ...
75
+ ```
76
+
77
+ ## Pydantic Models
78
+
79
+ - All wire types must be Pydantic models
80
+ - Use `Field()` for validation constraints
81
+ - Enable `arbitrary_types_allowed` for numpy/torch types
82
+
83
+ ```python
84
+ from pydantic import BaseModel, ConfigDict, Field
85
+ import numpy as np
86
+
87
+ class MyObservation(BaseModel):
88
+ # Pydantic v2 style (the nested `class Config` is deprecated)
89
+ model_config = ConfigDict(arbitrary_types_allowed=True)
90
+
91
+ grid: np.ndarray
92
+ score: float = Field(ge=0.0)
93
+ ```
94
+
95
+ ## Error Handling
96
+
97
+ - Return error info in observations, don't raise exceptions
98
+ - Use `done=True` with error observation for fatal errors
99
+ - Reserve exceptions for truly exceptional cases (server crashes)
100
+
101
+ ```python
102
+ def step(self, action: MyAction) -> MyObservation:
103
+ try:
104
+ result = self._execute(action)
105
+ return MyObservation(result=result, error=None, done=False)
106
+ except InvalidAction as e:
107
+ return MyObservation(result="", error=str(e), done=False)
108
+ except FatalError as e:
109
+ return MyObservation(result="", error=str(e), done=True)
110
+ ```
111
+
112
+ ## Reward Computation
113
+
114
+ Rewards are computed inside the environment, not externally:
115
+
116
+ ```python
117
+ def step(self, action: MyAction) -> MyObservation:
118
+ # Execute action
119
+ new_state = self._apply_action(action)
120
+
121
+ # Compute reward inside environment
122
+ reward = self._compute_reward(new_state)
123
+
124
+ return MyObservation(
125
+ state=new_state,
126
+ reward=reward,
127
+ done=self._is_terminal(new_state)
128
+ )
129
+ ```
130
+
131
+ ## FastAPI App Pattern
132
+
133
+ ```python
134
+ # server/app.py
135
+ from openenv.core.env_server import create_app
136
+ from .my_environment import MyEnvironment
137
+ from ..models import MyAction, MyObservation
138
+
139
+ env = MyEnvironment()
140
+ app = create_app(env, MyAction, MyObservation)
141
+ ```
.claude/docs/PRINCIPLES.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenEnv Design Principles
2
+
3
+ This document encodes the shared alignment between contributors on what OpenEnv optimizes for, what we trade off, and key decisions we've made.
4
+
5
+ ## Core Principles (from RFC 000)
6
+
7
+ 1. **Minimize lifecycle deltas**: Training → Evals → Production should use identical interfaces
8
+ 2. **Minimize human-agent divergence**: Tools that work for humans should work for agents
9
+ 3. **Be hands-on**: Provide ready-to-use implementations, not just specs
10
+ 4. **Design for LLMs**: Optimize for context efficiency, in-distribution behavior
11
+
12
+ ## What We Optimize For
13
+
14
+ - **Simple Gymnasium-style API** (`reset`, `step`, `state`) - familiar to RL practitioners
15
+ - **Container isolation** for reproducibility and security
16
+ - **Type safety** with generics and Pydantic across the wire
17
+ - **Production-readiness** from day one - training and production use same interfaces
18
+
19
+ ## What We Trade Off
20
+
21
+ - **Flexibility for simplicity**: One canonical way to build environments
22
+ - **Performance for isolation**: Docker overhead is acceptable for reproducibility
23
+ - **Cutting-edge for stability**: FastAPI over experimental frameworks
24
+
25
+ ## Key Decisions Made
26
+
27
+ These decisions are documented in RFCs and should not be changed without a new RFC:
28
+
29
+ | Decision | Rationale | RFC |
30
+ |----------|-----------|-----|
31
+ | **Rewards inside environment** | Domain knowledge encapsulated in env, not external | 002 |
32
+ | **Agents cannot reset** | Prevents learning that consequences are reversible | 001 |
33
+ | **MCP as universal standard** | All agent-environment tool interaction via MCP | 003 |
34
+ | **WebSocket for step loop** | Lower latency than HTTP per-step | 002 |
35
+ | **Two-interface model** | WebSocket for orchestration, MCP for agent tools | 001 |
36
+ | **One env = one trajectory** | Batching via environment stacking, not multiplexing | 004 |
37
+
38
+ **One env = one trajectory**: Environments do not support multiplexed trajectories. To generate batches, stack multiple environment instances. Helpers like `EnvPool` orchestrate batch collection across the stack. Multiplexing is left to future work.
39
+
40
+ ## When to Revisit These Principles
41
+
42
+ - If a principle blocks a valid use case, open an RFC discussion
43
+ - If production experience contradicts a trade-off, document and propose changes
44
+ - Pre-1.0: Breaking changes acceptable with documentation
45
+ - Post-1.0: Semantic versioning strictly enforced
.claude/docs/REPO_WALKTHROUGH.md ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Repository Walkthrough
2
+
3
+ This document provides a navigational guide to the OpenEnv codebase.
4
+
5
+ ## Top-Level Structure
6
+
7
+ ```
8
+ OpenEnv/
9
+ ├── CLAUDE.md # Entry point for Claude Code - build commands, architecture overview
10
+ ├── README.md # Project overview and getting started
11
+ ├── pyproject.toml # Python package configuration (uv/pip)
12
+ ├── uv.lock # Locked dependencies
13
+
14
+ ├── src/ # Core library code (installed as `openenv`)
15
+ ├── envs/ # Example environments (not installed, used via PYTHONPATH)
16
+ ├── tests/ # Test suite
17
+ ├── examples/ # Usage examples and tutorials
18
+ ├── docs/ # Documentation (Sphinx)
19
+ ├── rfcs/ # Design documents and architectural decisions
20
+ ├── scripts/ # Utility scripts
21
+
22
+ ├── .claude/ # Claude Code configuration (skills, agents, docs)
23
+ ├── .github/ # GitHub Actions, PR templates, issue templates
24
+ └── .gitignore
25
+ ```
26
+
27
+ ## Source Code (`src/`)
28
+
29
+ ```
30
+ src/
31
+ ├── openenv/ # Main package
32
+ │ ├── __init__.py
33
+ │ │
34
+ │ ├── core/ # Core abstractions - the heart of OpenEnv
35
+ │ │ ├── env_client.py # EnvClient base class (WebSocket client)
36
+ │ │ ├── client_types.py # Client-side type definitions
37
+ │ │ ├── utils.py # Shared utilities
38
+ │ │ │
39
+ │ │ ├── env_server/ # Server-side components
40
+ │ │ │ ├── interfaces.py # Environment abstract base class
41
+ │ │ │ ├── http_server.py # HTTPEnvServer (FastAPI + WebSocket)
42
+ │ │ │ ├── types.py # Wire types (Action, Observation, State, WS messages)
43
+ │ │ │ ├── serialization.py # Pydantic serialization helpers
44
+ │ │ │ ├── base_transforms.py # Transform pipeline for rewards/observations
45
+ │ │ │ ├── web_interface.py # Web UI for debugging environments
46
+ │ │ │ ├── route_config.py # FastAPI route configuration
47
+ │ │ │ └── exceptions.py # Server-side exceptions
48
+ │ │ │
49
+ │ │ ├── containers/ # Container lifecycle management
50
+ │ │ │ ├── runtime/ # Provider implementations
51
+ │ │ │ │ ├── providers.py # ContainerProvider/RuntimeProvider ABCs + LocalDockerProvider
52
+ │ │ │ │ ├── daytona_provider.py # DaytonaProvider (Daytona cloud sandboxes)
53
+ │ │ │ │ └── uv_provider.py # UVProvider (for local dev)
54
+ │ │ │ └── images/ # Base Docker images
55
+ │ │ │ └── Dockerfile # openenv-base image
56
+ │ │ │
57
+ │ │ └── tools/ # Reusable tool implementations
58
+ │ │ ├── local_python_executor.py # Python code execution
59
+ │ │ └── git_server_client.py # Git operations
60
+ │ │
61
+ │ └── cli/ # Command-line interface
62
+ │ ├── __main__.py # Entry point (`python -m openenv.cli`)
63
+ │ ├── commands/ # CLI subcommands
64
+ │ │ ├── init.py # `openenv init` - scaffold new env
65
+ │ │ ├── serve.py # `openenv serve` - run server locally
66
+ │ │ ├── build.py # `openenv build` - build Docker image
67
+ │ │ ├── push.py # `openenv push` - deploy to HF Spaces
68
+ │ │ └── validate.py # `openenv validate` - check config
69
+ │ └── templates/ # Scaffolding templates
70
+ │ └── openenv_env/ # Template for `openenv init`
71
+
72
+ └── openenv_core/ # Legacy compatibility shim (imports from openenv.core)
73
+ ```
74
+
75
+ ## Environments (`envs/`)
76
+
77
+ Each environment follows a consistent structure:
78
+
79
+ ```
80
+ envs/
81
+ ├── echo_env/ # Minimal reference environment
82
+ │ ├── client.py # EnvClient subclass
83
+ │ ├── models.py # Action, Observation, State models
84
+ │ ├── openenv.yaml # Environment manifest
85
+ │ ├── pyproject.toml # Environment-specific dependencies
86
+ │ ├── README.md
87
+ │ └── server/
88
+ │ ├── app.py # FastAPI app setup
89
+ │ ├── echo_environment.py # Environment implementation
90
+ │ └── Dockerfile # Container definition
91
+
92
+ ├── coding_env/ # Python code execution environment
93
+ ├── chat_env/ # Conversational environment
94
+ ├── textarena_env/ # Text-based games (TextArena)
95
+ ├── browsergym_env/ # Browser automation (BrowserGym)
96
+ ├── openspiel_env/ # Game theory environments (OpenSpiel)
97
+ ├── atari_env/ # Atari games via Gymnasium
98
+ ├── finrl_env/ # Financial RL environment
99
+ ├── git_env/ # Git operations environment
100
+ ├── snake_env/ # Classic Snake game
101
+ ├── sumo_rl_env/ # Traffic simulation (SUMO)
102
+ ├── connect4_env/ # Connect Four game
103
+ ├── dipg_safety_env/ # Safety-focused environment
104
+ ├── reasoning_gym_env/ # Reasoning problems and puzzles
105
+ └── websearch_env/ # Web search environment
106
+ ```
107
+
108
+ ## Tests (`tests/`)
109
+
110
+ ```
111
+ tests/
112
+ ├── conftest.py # Pytest fixtures
113
+ ├── test_*.py # Core library tests
114
+
115
+ ├── envs/ # Per-environment integration tests
116
+ │ ├── test_echo_environment.py
117
+ │ ├── test_coding_environment.py
118
+ │ └── ...
119
+
120
+ ├── test_cli/ # CLI command tests
121
+ └── scripts/ # Test utility scripts
122
+ ```
123
+
124
+ ## RFCs (`rfcs/`)
125
+
126
+ Design documents that capture architectural decisions:
127
+
128
+ ```
129
+ rfcs/
130
+ ├── README.md # RFC process and template
131
+ ├── 000-project-phases.md # Project vision and phases
132
+ ├── 001-abstractions.md # Core abstractions (Environment, Client, two-interface model)
133
+ ├── 002-env-spec.md # Environment specification
134
+ └── 003-mcp-support.md # MCP integration design
135
+ ```
136
+
137
+ ## Claude Code Configuration (`.claude/`)
138
+
139
+ ```
140
+ .claude/
141
+ ├── docs/ # Alignment documents
142
+ │ ├── PRINCIPLES.md # Design principles and trade-offs
143
+ │ ├── INVARIANTS.md # System invariants (must never violate)
144
+ │ ├── PATTERNS.md # Code patterns and conventions
145
+ │ ├── CONTRIBUTING.md # Agentic contribution workflow
146
+ │ └── REPO_WALKTHROUGH.md # This file
147
+
148
+ ├── skills/ # Auto-discovered skills
149
+ │ ├── alignment-review/
150
+ │ │ └── SKILL.md # Two-tier code review
151
+ │ ├── implement/
152
+ │ │ └── SKILL.md # Make tests pass (Green phase)
153
+ │ ├── pre-submit-pr/
154
+ │ │ └── SKILL.md # PR readiness validation
155
+ │ ├── rfc-check/
156
+ │ │ └── SKILL.md # RFC requirement analysis
157
+ │ ├── simplify/
158
+ │ │ └── SKILL.md # Refactor after tests pass
159
+ │ ├── sprint/
160
+ │ │ └── SKILL.md # Parallel multi-issue batch (Agent Teams)
161
+ │ ├── update-docs/
162
+ │ │ └── SKILL.md # Fix stale docs after API changes
163
+ │ ├── watch-pr/
164
+ │ │ └── SKILL.md # Monitor CI + Greptile review after PR
165
+ │ ├── work-on-issue/
166
+ │ │ └── SKILL.md # Start TDD on a single issue
167
+ │ └── write-tests/
168
+ │ └── SKILL.md # Write failing tests (Red phase)
169
+
170
+ ├── agents/ # Specialized subagents
171
+ │ ├── alignment-reviewer.md # Review for bugs + alignment
172
+ │ ├── build-validator.md # Validate builds
173
+ │ ├── docs-updater.md # Fix stale docs after API changes
174
+ │ ├── env-validator.md # Validate environments e2e
175
+ │ ├── implementer.md # Make tests pass with minimal code
176
+ │ ├── issue-worker.md # Extract requirements from GitHub issues
177
+ │ ├── openenv-architect.md # Design new features
178
+ │ ├── pr-planner.md # Plan stacked PRs for complex features
179
+ │ └── tester.md # Write high-signal, failing tests
180
+
181
+ └── hooks/ # Automation scripts
182
+ ├── lint.sh # Run ruff format check
183
+ ├── test.sh # Run pytest
184
+ ├── check-debug.sh # Find debug code
185
+ ├── post-push-pr.sh # Validate PR after push (freshness, CI, conflicts)
186
+ ├── tdd-state.sh # Shared TDD state helpers (is_tdd_active, activate, deactivate)
187
+ ├── tdd-deactivate.sh # Standalone TDD deactivation script
188
+ ├── install.sh # Install git hooks (pre-commit, pre-push, etc.)
189
+ ├── session-start.sh # SessionStart banner (3-state: TDD/worktree/explore)
190
+ ├── no-direct-code.sh # PreToolUse: block direct edits when TDD active
191
+ ├── pre-commit-check.sh # PreToolUse: warn on git commit in TDD mode
192
+ ├── pre-pr-check.sh # PreToolUse: block gh pr create if branch stale
193
+ ├── delegate-todos.sh # PostToolUse: TDD workflow reminder on TodoWrite
194
+ ├── after-tester.sh # SubagentStop: next steps after tester
195
+ ├── after-implementer.sh # SubagentStop: next steps after implementer
196
+ ├── ci-wait.sh # CI polling: block until checks complete or timeout
197
+ └── after-docs-updater.sh # SubagentStop: next steps after docs-updater
198
+ ```
199
+
200
+ ## Documentation (`docs/`)
201
+
202
+ Sphinx-based documentation:
203
+
204
+ ```
205
+ docs/
206
+ ├── Makefile # Sphinx build targets (html, html-noplot, html-stable)
207
+ ├── README.md # Local build instructions
208
+
209
+ └── source/ # Sphinx source root
210
+ ├── conf.py # Sphinx configuration
211
+ ├── index.md # Home page
212
+ ├── core.md # Core API reference (autodoc)
213
+ ├── cli.md # CLI reference (autodoc)
214
+ ├── auto_discovery.md # Auto-discovery API docs
215
+ ├── customizing-web-ui.md # Web UI customization guide
216
+ ├── environments.md # Environments catalog page
217
+
218
+ ├── environments/ # Per-environment documentation
219
+ │ ├── echo.md
220
+ │ ├── coding.md
221
+ │ └── ...
222
+
223
+ ├── getting_started/ # Sphinx Gallery executable tutorials
224
+ │ ├── plot_01_introduction_quickstart.py
225
+ │ ├── plot_02_using_environments.py
226
+ │ ├── plot_03_building_environments.py
227
+ │ ├── contributing-envs.md
228
+ │ └── environment-builder.md
229
+
230
+ ├── tutorials/ # Additional tutorials
231
+ │ ├── openenv-tutorial.md
232
+ │ ├── wordle-grpo.md
233
+ │ └── rl-training-2048.md
234
+
235
+ └── _static/ # Static assets (versions.json, etc.)
236
+ ```
237
+
238
+ ## Key Files to Know
239
+
240
+ | File | Purpose |
241
+ |------|---------|
242
+ | `src/openenv/core/env_server/interfaces.py` | `Environment` abstract base class |
243
+ | `src/openenv/core/env_client.py` | `EnvClient` WebSocket client |
244
+ | `src/openenv/core/env_server/http_server.py` | `HTTPEnvServer` FastAPI wrapper |
245
+ | `src/openenv/core/env_server/types.py` | All wire types and WebSocket messages |
246
+ | `envs/echo_env/` | Reference implementation - start here |
247
+ | `rfcs/001-abstractions.md` | Core architectural decisions |
248
+ | `.claude/docs/INVARIANTS.md` | Rules that must never be broken |
.claude/docs/TESTING_STRATEGY.md ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenEnv Testing Strategy
2
+
3
+ This document outlines OpenEnv's testing philosophy, hierarchy, and conventions.
4
+
5
+ ## Testing Hierarchy
6
+
7
+ Tests are organized by scope and signal:
8
+
9
+ ### 1. Unit Tests (Fastest, Most Isolated)
10
+
11
+ Test individual functions and classes in isolation.
12
+
13
+ **Good candidates:**
14
+ - Pure functions (e.g., reward calculations)
15
+ - Pydantic model validation
16
+ - State mutations
17
+ - Utility functions
18
+
19
+ **Location:** `tests/` mirroring `src/` structure
20
+
21
+ **Example:**
22
+ ```python
23
+ def test_action_model_validates_required_fields():
24
+ with pytest.raises(ValidationError):
25
+ Action() # Missing required fields
26
+ ```
27
+
28
+ ### 2. Integration Tests (Medium Scope)
29
+
30
+ Test component interactions, especially client-server communication.
31
+
32
+ **Good candidates:**
33
+ - Client-server WebSocket protocol
34
+ - Environment lifecycle (reset → step → step → ...)
35
+ - Type serialization across wire boundary
36
+
37
+ **Location:** `tests/` with `_integration` suffix or in dedicated directories
38
+
39
+ **Example:**
40
+ ```python
41
+ async def test_client_connects_and_resets():
42
+ async with start_server() as server:
43
+ client = EchoEnvClient(server.url)
44
+ obs = await client.reset()
45
+ assert isinstance(obs, EchoObservation)
46
+ ```
47
+
48
+ ### 3. Environment Validation Tests
49
+
50
+ Test that environments follow OpenEnv conventions and invariants.
51
+
52
+ **Good candidates:**
53
+ - File structure validation
54
+ - Type consistency (generics match)
55
+ - Invariant checking (no client→server imports)
56
+
57
+ **Location:** `tests/envs/`
58
+
59
+ **Uses:** `env-validator` agent patterns
60
+
61
+ ### 4. E2E Tests (Slowest, Highest Signal)
62
+
63
+ Test complete workflows from user perspective.
64
+
65
+ **Good candidates:**
66
+ - Full training loop simulation
67
+ - Container lifecycle
68
+ - MCP tool interactions
69
+
70
+ **Location:** `tests/e2e/` (if needed)
71
+
72
+ ## Test Location Conventions
73
+
74
+ ```
75
+ tests/
76
+ ├── conftest.py # Shared fixtures
77
+ ├── core/ # Core library tests
78
+ │ ├── test_environment.py
79
+ │ ├── test_client.py
80
+ │ └── test_server.py
81
+ ├── envs/ # Environment-specific tests
82
+ │ ├── test_echo_environment.py
83
+ │ └── test_<env>_environment.py
84
+ └── e2e/ # End-to-end tests (optional)
85
+ ```
86
+
87
+ ## Running Tests
88
+
89
+ ### Full Suite
90
+ ```bash
91
+ PYTHONPATH=src:envs uv run pytest tests/ -v --tb=short
92
+ ```
93
+
94
+ ### Single File
95
+ ```bash
96
+ PYTHONPATH=src:envs uv run pytest tests/path/test_file.py -v
97
+ ```
98
+
99
+ ### Single Test
100
+ ```bash
101
+ PYTHONPATH=src:envs uv run pytest tests/path/test_file.py::test_name -v
102
+ ```
103
+
104
+ ### Exclude Special Environments
105
+ Some environments require special setup (browser, websearch). The hook script excludes these:
106
+ ```bash
107
+ bash .claude/hooks/test.sh
108
+ ```
109
+
110
+ ## Edge Cases to Consider
111
+
112
+ ### Python-Specific
113
+ - `None` where not expected
114
+ - Type mismatches at runtime (despite type hints)
115
+ - Pydantic `ValidationError` on invalid data
116
+ - Async/await edge cases (timeouts, cancellation)
117
+
118
+ ### State Management
119
+ - Empty state / default values
120
+ - Maximum capacity / overflow
121
+ - State after error recovery
122
+ - Concurrent access patterns
123
+
124
+ ### Protocol / WebSocket
125
+ - Connection drops mid-step
126
+ - Out-of-order messages
127
+ - Malformed JSON payloads
128
+ - Timeout handling
129
+
130
+ ### Pydantic Models
131
+ - Extra fields in input (strict mode)
132
+ - Missing required fields
133
+ - Type coercion behavior
134
+ - Nested model validation
135
+
136
+ ## Test Patterns
137
+
138
+ ### Fixtures for Common Setup
139
+
140
+ ```python
141
+ @pytest.fixture
142
+ def echo_env():
143
+ """Create a fresh EchoEnvironment for each test."""
144
+ return EchoEnvironment()
145
+
146
+ def test_reset_returns_observation(echo_env):
147
+ obs = echo_env.reset()
148
+ assert isinstance(obs, EchoObservation)
149
+ ```
150
+
151
+ ### Async Tests
152
+
153
+ ```python
154
+ import pytest
155
+
156
+ @pytest.mark.asyncio
157
+ async def test_async_client():
158
+ async with create_client() as client:
159
+ result = await client.step(action)
160
+ assert result.done is False
161
+ ```
162
+
163
+ ### Parametrized Tests
164
+
165
+ ```python
166
+ @pytest.mark.parametrize("text,expected", [
167
+ ("hello", "HELLO"),
168
+ ("", ""),
169
+ ("123", "123"),
170
+ ])
171
+ def test_transform(text, expected):
172
+ assert transform(text) == expected
173
+ ```
174
+
175
+ ## What Makes a Good Test
176
+
177
+ ### High-Signal (Write These)
178
+
179
+ - Catches bugs that could happen in production
180
+ - Tests behavior from user perspective
181
+ - Covers non-obvious edge cases
182
+ - Validates complex state machines
183
+
184
+ ### Low-Signal (Avoid These)
185
+
186
+ - Tests that verify Python built-ins work
187
+ - Duplicates of existing tests with trivial variation
188
+ - Tests that mock so much they don't test real behavior
189
+ - Tests for code paths already covered by integration tests
190
+
191
+ ## TDD Workflow
192
+
193
+ The testing strategy integrates with the TDD workflow:
194
+
195
+ 1. **Red**: `/write-tests` creates failing tests
196
+ 2. **Green**: `/implement` makes tests pass
197
+ 3. **Refactor**: `/simplify` cleans up code
198
+ 4. **Validate**: `/pre-submit-pr` runs full suite
199
+
200
+ ## Coverage Gaps (Known)
201
+
202
+ Document known gaps here as they're identified:
203
+
204
+ - [ ] WebSocket reconnection handling
205
+ - [ ] Container lifecycle edge cases
206
+ - [ ] MCP tool error responses (when MCP is added)
207
+
208
+ ## Verification
209
+
210
+ After writing tests, verify with:
211
+
212
+ ```bash
213
+ # Run specific tests
214
+ PYTHONPATH=src:envs uv run pytest tests/path/test_file.py -v
215
+
216
+ # Check coverage (if coverage is set up)
217
+ PYTHONPATH=src:envs uv run pytest tests/ --cov=src/openenv
218
+
219
+ # Run lint to ensure test code is clean
220
+ uv run ruff check tests/
221
+ ```
.claude/hooks/after-docs-updater.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # SubagentStop hook for docs-updater: Suggest next steps
3
+
4
+ echo ""
5
+ echo "Documentation update complete."
6
+ echo ""
7
+ echo "Next steps:"
8
+ echo " - /simplify -> refactor if needed (optional)"
9
+ echo " - /pre-submit-pr -> validate before creating PR"
10
+ echo " - /watch-pr -> monitor CI + review after PR (after pre-submit)"
11
+ echo ""
.claude/hooks/after-implementer.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # SubagentStop hook for implementer: Suggest next steps
3
+
4
+ echo ""
5
+ echo "Implementation complete."
6
+ echo ""
7
+ echo "Next steps:"
8
+ echo " - /update-docs -> fix stale docs if APIs changed"
9
+ echo " - /simplify -> refactor if needed (optional)"
10
+ echo " - Mark todo complete and move to next pending todo"
11
+ echo " - /pre-submit-pr -> validate before creating PR"
12
+ echo ""
.claude/hooks/after-tester.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # SubagentStop hook for tester: Chain to /implement
3
+
4
+ echo ""
5
+ echo "Tests written by tester agent."
6
+ echo ""
7
+ echo "Next step: Run /implement to make the tests pass."
8
+ echo ""
.claude/hooks/check-debug.sh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Check for debug code that shouldn't be committed
3
+ # Exit code 0 always (informational), but outputs findings
4
+
5
+ # Check for required tools
6
+ if ! command -v rg &> /dev/null; then
7
+ echo "Warning: 'rg' (ripgrep) is not installed, falling back to grep"
8
+ USE_GREP=1
9
+ fi
10
+
11
+ echo "=== Checking for debug code ==="
12
+
13
+ found_issues=0
14
+
15
+ # Check for print statements (allow if marked with # ok-to-print)
16
+ echo ""
17
+ echo "--- Print statements in src/ ---"
18
+ if [ "$USE_GREP" = "1" ]; then
19
+ prints=$(grep -rn "print(" src/ --include="*.py" 2>/dev/null | grep -v "# ok-to-print" || true)
20
+ else
21
+ prints=$(rg -n "print\(" src/ --glob "*.py" 2>/dev/null | grep -v "# ok-to-print" || true)
22
+ fi
23
+
24
+ if [ -n "$prints" ]; then
25
+ echo "$prints"
26
+ found_issues=1
27
+ else
28
+ echo "None found"
29
+ fi
30
+
31
+ # Check for TODO/FIXME/XXX/HACK comments
32
+ echo ""
33
+ echo "--- TODO/FIXME comments in src/ ---"
34
+ if [ "$USE_GREP" = "1" ]; then
35
+ todos=$(grep -rn -E "TODO|FIXME|XXX|HACK" src/ --include="*.py" 2>/dev/null || true)
36
+ else
37
+ todos=$(rg -n "TODO|FIXME|XXX|HACK" src/ --glob "*.py" 2>/dev/null || true)
38
+ fi
39
+
40
+ if [ -n "$todos" ]; then
41
+ echo "$todos"
42
+ found_issues=1
43
+ else
44
+ echo "None found"
45
+ fi
46
+
47
+ # Check for debugger statements
48
+ echo ""
49
+ echo "--- Debugger statements in src/ ---"
50
+ if [ "$USE_GREP" = "1" ]; then
51
+ debuggers=$(grep -rn -E "breakpoint\(\)|pdb\.|ipdb\." src/ --include="*.py" 2>/dev/null || true)
52
+ else
53
+ debuggers=$(rg -n "breakpoint\(\)|pdb\.|ipdb\." src/ --glob "*.py" 2>/dev/null || true)
54
+ fi
55
+
56
+ if [ -n "$debuggers" ]; then
57
+ echo "$debuggers"
58
+ found_issues=1
59
+ else
60
+ echo "None found"
61
+ fi
62
+
63
+ echo ""
64
+ if [ $found_issues -eq 1 ]; then
65
+ echo "=== Debug code found (review before committing) ==="
66
+ else
67
+ echo "=== No debug code found ==="
68
+ fi
69
+
70
+ # Always exit 0 - this is informational
71
+ exit 0
.claude/hooks/check-line-endings.sh ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Check for CRLF line endings in text files
# Uses portable constructs that work in sandboxed environments
#
# Usage: check-line-endings.sh [DIR]
# Exit 0 when no CRLF files are found; exit 1 (with a file list on stderr)
# otherwise. Inside a git repo only tracked files are scanned; outside a
# repo every non-binary file under DIR is scanned.

set -e

# Get the directory to check (default to current directory)
CHECK_DIR="${1:-.}"

# Find all tracked text files with CRLF line endings
CRLF_FILES=()

# Check if we're in a git repository
if git -C "$CHECK_DIR" rev-parse --git-dir > /dev/null 2>&1; then
    # In a git repo - check only tracked files
    # Use a temp file for portability (avoids process substitution issues in sandboxes)
    TEMP_FILE=$(mktemp)
    trap "rm -f '$TEMP_FILE'" EXIT

    (cd "$CHECK_DIR" && git ls-files) > "$TEMP_FILE"

    while IFS= read -r file; do
        # Skip if file doesn't exist
        if [[ ! -f "$file" ]]; then
            continue
        fi

        # Check if file is binary using git
        # (numstat prints "-" counts for binary blobs)
        if git diff --no-index --numstat /dev/null "$file" 2>/dev/null | grep -q "^-"; then
            continue
        fi

        # Check for CRLF line endings
        if grep -qU $'\r' "$file" 2>/dev/null; then
            CRLF_FILES+=("$file")
        fi
    done < "$TEMP_FILE"
else
    # Not a git repo - check all text files
    # Use a temp file for portability
    TEMP_FILE=$(mktemp)
    trap "rm -f '$TEMP_FILE'" EXIT

    find "$CHECK_DIR" -type f -print > "$TEMP_FILE" 2>/dev/null || true

    while IFS= read -r file; do
        # Skip if file doesn't exist or is a directory
        if [[ ! -f "$file" ]]; then
            continue
        fi

        # Simple binary file check - skip binary (and empty) files.
        # grep -I treats binary files as containing no matches, so "." finds
        # nothing in them; -I is supported by both GNU and BSD/macOS grep.
        # (Previously this used GNU-only `grep -qP '\x00'`, which errors out
        # on BSD/macOS grep and silently skipped nothing there.)
        if ! grep -Iq . "$file" 2>/dev/null; then
            continue
        fi

        # Check for CRLF line endings
        if grep -qU $'\r' "$file" 2>/dev/null; then
            CRLF_FILES+=("$file")
        fi
    done < "$TEMP_FILE"
fi

# Report results
if [[ ${#CRLF_FILES[@]} -gt 0 ]]; then
    echo "ERROR: Found ${#CRLF_FILES[@]} file(s) with CRLF line endings:" >&2
    for file in "${CRLF_FILES[@]}"; do
        echo " - $file" >&2
    done
    echo "" >&2
    echo "To fix, convert these files to LF line endings:" >&2
    echo " dos2unix <file> # or use your editor's line ending conversion" >&2
    exit 1
fi

exit 0
.claude/hooks/ci-wait.sh ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # CI polling script. Blocks until all CI checks complete or timeout.
3
+ #
4
+ # Usage: bash .claude/hooks/ci-wait.sh <PR_NUMBER> [TIMEOUT_SECONDS]
5
+ #
6
+ # Exit codes:
7
+ # 0 - All checks passed
8
+ # 1 - One or more checks failed
9
+ # 2 - Timeout exceeded
10
+ # 3 - Error (could not fetch PR)
11
+ #
12
+ # Polls every 120 seconds. Prints status updates to stdout.
13
+
14
+ set -e
15
+
16
+ PR_NUMBER="${1:?Usage: ci-wait.sh <PR_NUMBER> [TIMEOUT_SECONDS]}"
17
+ TIMEOUT="${2:-1800}"
18
+ POLL_INTERVAL=120
19
+ ELAPSED=0
20
+
21
+ echo ""
22
+ echo "==================================================================="
23
+ echo " CI Wait: Monitoring PR #$PR_NUMBER"
24
+ echo "==================================================================="
25
+ echo " Timeout: ${TIMEOUT}s | Poll interval: ${POLL_INTERVAL}s"
26
+ echo ""
27
+
28
+ while true; do
29
+ # Fetch current check status
30
+ PR_JSON=$(gh pr view "$PR_NUMBER" --json statusCheckRollup 2>/dev/null || true)
31
+ if [[ -z "$PR_JSON" ]]; then
32
+ echo "ERROR: Could not fetch PR #$PR_NUMBER"
33
+ exit 3
34
+ fi
35
+
36
+ CHECK_COUNT=$(echo "$PR_JSON" | jq '.statusCheckRollup | length' 2>/dev/null || echo "0")
37
+
38
+ if [[ "$CHECK_COUNT" -eq 0 ]]; then
39
+ echo "[$(date +%H:%M:%S)] No CI checks found yet. Waiting..."
40
+ else
41
+ PENDING=$(echo "$PR_JSON" | jq '[.statusCheckRollup[] | select(.status != "COMPLETED")] | length' 2>/dev/null || echo "0")
42
+ FAILED_CHECKS=$(echo "$PR_JSON" | jq '[.statusCheckRollup[] | select(.conclusion == "FAILURE")] | length' 2>/dev/null || echo "0")
43
+ PASSED_CHECKS=$(echo "$PR_JSON" | jq '[.statusCheckRollup[] | select(.conclusion == "SUCCESS")] | length' 2>/dev/null || echo "0")
44
+
45
+ echo "[$(date +%H:%M:%S)] Checks: $PASSED_CHECKS passed, $FAILED_CHECKS failed, $PENDING pending (of $CHECK_COUNT)"
46
+
47
+ # If no checks are pending, we have a final result
48
+ if [[ "$PENDING" -eq 0 ]]; then
49
+ echo ""
50
+ if [[ "$FAILED_CHECKS" -gt 0 ]]; then
51
+ echo "==================================================================="
52
+ echo " CI FAILED: $FAILED_CHECKS check(s) failed"
53
+ echo "==================================================================="
54
+ echo ""
55
+ echo "Failed checks:"
56
+ echo "$PR_JSON" | jq -r '.statusCheckRollup[] | select(.conclusion == "FAILURE") | " - \(.name)"'
57
+ echo ""
58
+ exit 1
59
+ elif [[ "$PASSED_CHECKS" -ne "$CHECK_COUNT" ]]; then
60
+ echo "==================================================================="
61
+ echo " CI INCOMPLETE: $((CHECK_COUNT - PASSED_CHECKS - FAILED_CHECKS)) check(s) cancelled/skipped"
62
+ echo "==================================================================="
63
+ echo ""
64
+ echo "Non-success checks:"
65
+ echo "$PR_JSON" | jq -r '.statusCheckRollup[] | select(.conclusion != "SUCCESS" and .conclusion != null) | " - \(.name): \(.conclusion)"'
66
+ echo ""
67
+ exit 1
68
+ else
69
+ echo "==================================================================="
70
+ echo " CI PASSED: All $PASSED_CHECKS check(s) passed"
71
+ echo "==================================================================="
72
+ echo ""
73
+ exit 0
74
+ fi
75
+ fi
76
+ fi
77
+
78
+ # Check timeout
79
+ if [[ "$ELAPSED" -ge "$TIMEOUT" ]]; then
80
+ echo ""
81
+ echo "==================================================================="
82
+ echo " CI TIMEOUT: Exceeded ${TIMEOUT}s waiting for checks"
83
+ echo "==================================================================="
84
+ echo ""
85
+ if [[ "$CHECK_COUNT" -gt 0 ]]; then
86
+ echo "Pending checks:"
87
+ echo "$PR_JSON" | jq -r '.statusCheckRollup[] | select(.status != "COMPLETED") | " - \(.name): \(.status)"'
88
+ echo ""
89
+ fi
90
+ exit 2
91
+ fi
92
+
93
+ # Sleep and increment
94
+ sleep "$POLL_INTERVAL"
95
+ ELAPSED=$((ELAPSED + POLL_INTERVAL))
96
+ done
.claude/hooks/delegate-todos.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # PostToolUse hook for TodoWrite: Remind about TDD workflow when TDD is active
3
+
4
+ # Check if TDD is active
5
+ source "$(dirname "$0")/tdd-state.sh"
6
+ if ! is_tdd_active; then
7
+ exit 0 # TDD not active, no reminder needed
8
+ fi
9
+
10
+ # Soft reminder about the workflow
11
+ cat << 'EOF'
12
+
13
+ TDD Workflow Reminder:
14
+ For each todo that requires implementation:
15
+ 1. /write-tests -> create failing tests first
16
+ 2. /implement -> make tests pass
17
+ 3. Mark todo complete
18
+
19
+ EOF
20
+
21
+ exit 0
.claude/hooks/install.sh ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Install git hooks for OpenEnv
3
+ #
4
+ # Usage: .claude/hooks/install.sh
5
+ #
6
+ # This installs pre-commit, pre-push, commit-msg, and post-merge hooks.
7
+
8
+ set -e
9
+
10
+ REPO_ROOT="$(git rev-parse --show-toplevel)"
11
+ # Use --git-common-dir to get the shared hooks directory (works in worktrees too)
12
+ GIT_COMMON_DIR="$(git rev-parse --git-common-dir)"
13
+ HOOKS_DIR="$GIT_COMMON_DIR/hooks"
14
+
15
+ # Create hooks directory if it doesn't exist
16
+ mkdir -p "$HOOKS_DIR"
17
+
18
+ echo "Installing git hooks..."
19
+
20
+ # Pre-commit hook: format, lint, branch check
21
+ cat > "$HOOKS_DIR/pre-commit" << 'EOF'
22
+ #!/bin/bash
23
+ # Installed by .claude/hooks/install.sh
24
+
25
+ echo "Running pre-commit checks..."
26
+
27
+ REPO_ROOT="$(git rev-parse --show-toplevel)"
28
+
29
+ # === Branch Check (BLOCKING) ===
30
+ echo ""
31
+ echo "=== Branch Check ==="
32
+ BRANCH=$(git rev-parse --abbrev-ref HEAD)
33
+ if [ "$BRANCH" = "main" ] || [ "$BRANCH" = "master" ]; then
34
+ echo "ERROR: Cannot commit directly to $BRANCH"
35
+ echo ""
36
+ echo "Create a worktree first:"
37
+ echo " $REPO_ROOT/.claude/scripts/worktree-create.sh <name>"
38
+ exit 1
39
+ fi
40
+ echo "On branch: $BRANCH"
41
+
42
+ # === Import Sort + Format Check ===
43
+ echo ""
44
+ echo "=== Import Sort + Format Check ==="
45
+ # Run the arc f pipeline: usort then ruff format
46
+ uv run usort format src/ tests/ >/dev/null 2>&1
47
+ uv run ruff format src/ tests/ >/dev/null 2>&1
48
+ CHANGED=$(git diff --name-only -- '*.py' 2>/dev/null || true)
49
+ if [ -n "$CHANGED" ]; then
50
+ echo "Files need formatting (usort + ruff format):"
51
+ echo "$CHANGED"
52
+ echo ""
53
+ echo "Auto-formatting and staging changes..."
54
+ git add $CHANGED
55
+ echo "Fixed! Changes staged."
56
+ else
57
+ echo "Import sort + format check passed!"
58
+ fi
59
+
60
+ # === Lint Check ===
61
+ echo ""
62
+ echo "=== Lint Check ==="
63
+ "$REPO_ROOT/.claude/hooks/lint.sh" || {
64
+ echo "Lint failed. Fix issues before committing."
65
+ exit 1
66
+ }
67
+
68
+ # === Debug Artifacts (non-blocking) ===
69
+ echo ""
70
+ echo "=== Debug Artifacts ==="
71
+ "$REPO_ROOT/.claude/hooks/check-debug.sh"
72
+
73
+ echo ""
74
+ echo "Pre-commit checks passed"
75
+ EOF
76
+ chmod +x "$HOOKS_DIR/pre-commit"
77
+ echo " Installed pre-commit hook"
78
+
79
+ # Commit-msg hook: require issue reference
80
+ cat > "$HOOKS_DIR/commit-msg" << 'EOF'
81
+ #!/bin/bash
82
+ # Installed by .claude/hooks/install.sh
83
+ # Require issue reference in commit message
84
+
85
+ COMMIT_MSG_FILE="$1"
86
+ COMMIT_MSG=$(cat "$COMMIT_MSG_FILE")
87
+
88
+ # Check for issue reference (#123, Fixes #123, Part of #123, etc.)
89
+ if echo "$COMMIT_MSG" | grep -qE '#[0-9]+'; then
90
+ exit 0
91
+ fi
92
+
93
+ # Allow WIP commits without issue reference
94
+ if echo "$COMMIT_MSG" | grep -qiE '^WIP'; then
95
+ exit 0
96
+ fi
97
+
98
+ echo ""
99
+ echo "WARNING: Commit message should reference an issue (#123)"
100
+ echo " Examples: 'Fix bug in parser #45'"
101
+ echo " 'Fixes #123'"
102
+ echo " 'Part of #99'"
103
+ echo ""
104
+ echo "Proceeding anyway (this is a soft warning)..."
105
+ exit 0
106
+ EOF
107
+ chmod +x "$HOOKS_DIR/commit-msg"
108
+ echo " Installed commit-msg hook"
109
+
110
+ # Pre-push hook: comprehensive validation
111
+ cat > "$HOOKS_DIR/pre-push" << 'EOF'
112
+ #!/bin/bash
113
+ # Installed by .claude/hooks/install.sh
114
+ # Comprehensive pre-push validation
115
+
116
+ echo "Running pre-push checks..."
117
+
118
+ REPO_ROOT="$(git rev-parse --show-toplevel)"
119
+ FAILED=0
120
+
121
+ # 0. BLOCK PUSHES TO MAIN/MASTER (most critical check)
122
+ echo ""
123
+ echo "=== Protected Branch Check ==="
124
+ # Read the remote and refs being pushed from stdin
125
+ while read local_ref local_sha remote_ref remote_sha; do
126
+ # Extract branch name from remote ref (refs/heads/main -> main)
127
+ remote_branch="${remote_ref#refs/heads/}"
128
+
129
+ if [ "$remote_branch" = "main" ] || [ "$remote_branch" = "master" ]; then
130
+ echo "ERROR: Direct push to '$remote_branch' is blocked!"
131
+ echo ""
132
+ echo " You are trying to push to a protected branch."
133
+ echo " Create a PR instead:"
134
+ echo ""
135
+ echo " # Push to a feature branch"
136
+ echo " git push -u origin HEAD:feature/your-branch-name"
137
+ echo ""
138
+ echo " # Then create a PR"
139
+ echo " gh pr create"
140
+ echo ""
141
+ echo " To bypass (not recommended): git push --no-verify"
142
+ exit 1
143
+ fi
144
+ done
145
+ echo "Not pushing to protected branch - OK"
146
+
147
+ # 1. Import sort + format check
148
+ echo ""
149
+ echo "=== Import Sort + Format Check ==="
150
+ uv run usort format src/ tests/ >/dev/null 2>&1
151
+ uv run ruff format src/ tests/ >/dev/null 2>&1
152
+ CHANGED_FMT=$(git diff --name-only -- '*.py' 2>/dev/null || true)
153
+ if [ -n "$CHANGED_FMT" ]; then
154
+ echo "Files not properly formatted:"
155
+ echo "$CHANGED_FMT"
156
+ echo ""
157
+ echo "Run: uv run usort format src/ tests/ && uv run ruff format src/ tests/"
158
+ git checkout -- $CHANGED_FMT 2>/dev/null || true
159
+ FAILED=1
160
+ fi
161
+
162
+ # 2. Lint check
163
+ echo ""
164
+ echo "=== Lint Check ==="
165
+ "$REPO_ROOT/.claude/hooks/lint.sh" || {
166
+ echo "Lint failed"
167
+ FAILED=1
168
+ }
169
+
170
+ # 3. Test check
171
+ echo ""
172
+ echo "=== Test Check ==="
173
+ "$REPO_ROOT/.claude/hooks/test.sh" || {
174
+ echo "Tests failed"
175
+ FAILED=1
176
+ }
177
+
178
+ # 4. Debug artifacts
179
+ echo ""
180
+ echo "=== Debug Artifacts ==="
181
+ "$REPO_ROOT/.claude/hooks/check-debug.sh"
182
+
183
+ # 5. Invariant: Client should not import from server
184
+ echo ""
185
+ echo "=== Invariant Checks ==="
186
+ # Check if any client file imports from server directory
187
+ # Pattern matches actual imports: "from .server", "from ..server", "import server"
188
+ # Excludes comments and string literals mentioning "server"
189
+ VIOLATIONS=$(grep -rE "^[[:space:]]*(from [.]+server|import server)" --include="*.py" envs/*/client.py envs/*/__init__.py 2>/dev/null | grep -v "# noqa" || true)
190
+ if [ -n "$VIOLATIONS" ]; then
191
+ echo "INVARIANT VIOLATION: Client imports from server"
192
+ echo "$VIOLATIONS"
193
+ echo ""
194
+ echo " Client code must not import server code. Check INVARIANTS.md."
195
+ echo " Add '# noqa' comment to suppress if this is intentional (e.g., for local testing)."
196
+ # Note: This is a warning for now due to pre-existing violations
197
+ # TODO: Make this blocking once all violations are fixed (issue #XXX)
198
+ echo " (Currently warning-only - see pre-existing violations)"
199
+ else
200
+ echo "Client-server separation maintained"
201
+ fi
202
+
203
+ # 6. Check branch freshness with main (warning only, non-blocking)
204
+ echo ""
205
+ echo "=== Branch Freshness Check ==="
206
+ # Fetch latest main silently
207
+ git fetch origin main --quiet 2>/dev/null || true
208
+
209
+ # Check how many commits behind main we are
210
+ BEHIND_COUNT=$(git rev-list --count HEAD..origin/main 2>/dev/null || echo "0")
211
+ if [ "$BEHIND_COUNT" -gt 0 ]; then
212
+ echo "WARNING: Your branch is $BEHIND_COUNT commit(s) behind main!"
213
+ echo ""
214
+ echo " GitHub will show 'This branch is out-of-date with the base branch'"
215
+ echo ""
216
+ echo " To update before pushing:"
217
+ echo " git fetch origin main"
218
+ echo " git merge origin/main"
219
+ echo " git push"
220
+ echo ""
221
+ echo " Pushing anyway (update before merging PR)"
222
+ else
223
+ echo "Branch is up to date with main"
224
+ fi
225
+
226
+ # 7. Check for conflicts with main (warning only, non-blocking)
227
+ echo ""
228
+ echo "=== Conflict Check with main ==="
229
+ # Try a test merge to detect conflicts (then abort)
230
+ MERGE_OUTPUT=$(git merge --no-commit --no-ff origin/main 2>&1) || true
231
+ MERGE_EXIT=$?
232
+ git merge --abort 2>/dev/null || true
233
+
234
+ if echo "$MERGE_OUTPUT" | grep -q "CONFLICT"; then
235
+ echo "WARNING: Your branch has conflicts with main!"
236
+ echo ""
237
+ echo "$MERGE_OUTPUT" | grep "CONFLICT" | head -5
238
+ echo ""
239
+ echo " To resolve before PR review:"
240
+ echo " git fetch origin main"
241
+ echo " git merge origin/main"
242
+ echo " # resolve conflicts"
243
+ echo " git push"
244
+ echo ""
245
+ echo " Pushing anyway (fix conflicts before merging PR)"
246
+ else
247
+ echo "No conflicts with main detected"
248
+ fi
249
+
250
+ # Summary
251
+ echo ""
252
+ if [ $FAILED -eq 1 ]; then
253
+ echo "Pre-push checks FAILED. Fix issues before pushing."
254
+ exit 1
255
+ else
256
+ echo "Pre-push checks passed"
257
+ fi
258
+ EOF
259
+ chmod +x "$HOOKS_DIR/pre-push"
260
+ echo " Installed pre-push hook"
261
+
262
+ # Post-merge hook: remind about worktree cleanup
263
+ cat > "$HOOKS_DIR/post-merge" << 'EOF'
264
+ #!/bin/bash
265
+ # Installed by .claude/hooks/install.sh
266
+ # Remind about worktree cleanup after merge
267
+
268
+ echo ""
269
+ echo "=== Post-Merge Reminder ==="
270
+
271
+ # Check if we're in a worktree
272
+ TOPLEVEL=$(git rev-parse --show-toplevel 2>/dev/null)
273
+ if [ -f "$TOPLEVEL/.git" ]; then
274
+ echo "You're in a worktree: $TOPLEVEL"
275
+ echo ""
276
+ echo "If this PR is complete, clean up with:"
277
+ echo " .claude/scripts/worktree-cleanup.sh $TOPLEVEL"
278
+ fi
279
+ EOF
280
+ chmod +x "$HOOKS_DIR/post-merge"
281
+ echo " Installed post-merge hook"
282
+
283
+ echo ""
284
+ echo "Git hooks installed successfully!"
285
+ echo ""
286
+ echo "Hooks installed:"
287
+ echo " - pre-commit: branch check, usort+format, lint, check-debug"
288
+ echo " - commit-msg: issue reference reminder (soft warning)"
289
+ echo " - pre-push: usort+format, lint, tests, check-debug, invariant checks, conflict detection"
290
+ echo " - post-merge: worktree cleanup reminder"
291
+ echo ""
292
+ echo "To skip hooks temporarily: git commit/push --no-verify"
.claude/hooks/lint.sh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Lint check for OpenEnv
3
+ # Replicates the exact arc f pipeline from fbsource:
4
+ # 1. usort format — sort imports (matches arc f's usort pass)
5
+ # 2. ruff format — code formatting, line-length 88 (matches arc f's ruff-api pass)
6
+ # 3. ruff check — lint rules (E, F, W)
7
+ #
8
+ # usort is scoped to src/ and tests/ only. envs/ uses ruff format only
9
+ # because standalone usort and pyfmt's usort disagree on import ordering
10
+ # inside try/except blocks in some env files.
11
+
12
+ set -e
13
+
14
+ # Check for required tools
15
+ if ! command -v uv &> /dev/null; then
16
+ echo "Error: 'uv' is not installed or not in PATH"
17
+ echo "Install with: curl -LsSf https://astral.sh/uv/install.sh | sh"
18
+ exit 1
19
+ fi
20
+
21
+ echo "=== Running import sort + format check ==="
22
+ # Run the same pipeline as arc f: usort then ruff format.
23
+ # If any file changes, the code wasn't properly formatted.
24
+ uv run usort format src/ tests/ >/dev/null 2>&1
25
+ uv run ruff format src/ tests/ envs/ >/dev/null 2>&1
26
+
27
+ # Check if any files were modified (means they weren't formatted before)
28
+ CHANGED=$(git diff --name-only -- '*.py' 2>/dev/null || true)
29
+ if [ -n "$CHANGED" ]; then
30
+ echo "ERROR: The following files need formatting:"
31
+ echo "$CHANGED"
32
+ echo ""
33
+ echo "Run: uv run usort format src/ tests/ && uv run ruff format src/ tests/ envs/"
34
+ # Undo the formatting so the working tree stays as-is
35
+ git checkout -- $CHANGED 2>/dev/null || true
36
+ exit 1
37
+ fi
38
+ echo "Import sort + format check passed!"
39
+
40
+ echo "=== Running lint rules check ==="
41
+ uv run ruff check src/ tests/
42
+
43
+ echo "=== Lint check passed ==="
.claude/hooks/no-direct-code.sh ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # PreToolUse hook for Edit/Write: Block direct code edits in TDD mode
3
+ #
4
+ # Design: Only block when TDD is activated via /work-on-issue.
5
+ # Worktrees without TDD marker and the main repo allow direct edits.
6
+
7
+ # Check if TDD is active (marker file from /work-on-issue)
8
+ source "$(dirname "$0")/tdd-state.sh"
9
+ if ! is_tdd_active; then
10
+ exit 0 # TDD not active, allow all edits
11
+ fi
12
+
13
+ # Read JSON from stdin (hook input format)
14
+ INPUT=$(cat)
15
+ FILE_PATH=$(echo "$INPUT" | jq -r '.tool_input.file_path // empty' 2>/dev/null)
16
+
17
+ # If no file path or jq failed, allow
18
+ if [[ -z "$FILE_PATH" ]]; then
19
+ exit 0
20
+ fi
21
+
22
+ # Only check Python implementation files
23
+ if [[ "$FILE_PATH" != *.py ]]; then
24
+ exit 0 # Not a Python file, allow
25
+ fi
26
+
27
+ # Allow test files
28
+ if [[ "$FILE_PATH" == *test* ]] || [[ "$FILE_PATH" == */tests/* ]]; then
29
+ exit 0 # Test file, allow (tester persona can write these)
30
+ fi
31
+
32
+ # Allow non-src files (scripts, configs, etc.)
33
+ if [[ "$FILE_PATH" != */src/* ]] && [[ "$FILE_PATH" != */envs/* ]]; then
34
+ exit 0
35
+ fi
36
+
37
+ # Block with helpful message
38
+ ISSUE=$(get_tdd_issue)
39
+ cat >&2 << EOF
40
+
41
+ ===================================================================
42
+ TDD MODE: Direct code edit blocked (issue #${ISSUE:-?})
43
+ ===================================================================
44
+
45
+ In TDD mode, use the TDD workflow:
46
+
47
+ 1. /write-tests -> tester writes failing tests
48
+ 2. /implement -> implementer makes tests pass
49
+
50
+ To bypass this check, say "skip TDD" in your message.
51
+
52
+ ===================================================================
53
+
54
+ EOF
55
+
56
+ exit 2
.claude/hooks/post-push-pr.sh ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Post-push PR validation. Run after `gh pr create` or `git push` to verify
# the PR looks good on GitHub.
#
# Usage: bash .claude/hooks/post-push-pr.sh [PR_NUMBER]
#
# If PR_NUMBER is omitted, uses the PR for the current branch.
# Exits 0 when all checks pass, 1 when any blocking issue is found.

set -e

REPO_ROOT="$(git rev-parse --show-toplevel)"
PR_NUMBER="${1:-}"
FAILED=0

echo ""
echo "==================================================================="
echo " Post-Push PR Checks"
echo "==================================================================="
echo ""

# Resolve PR number from current branch if not provided
if [[ -z "$PR_NUMBER" ]]; then
    PR_NUMBER=$(gh pr view --json number -q '.number' 2>/dev/null || true)
    if [[ -z "$PR_NUMBER" ]]; then
        echo "ERROR: No PR found for current branch."
        echo " Create one with: gh pr create"
        exit 1
    fi
fi

echo "Checking PR #$PR_NUMBER..."
echo ""

# Fetch PR details in one call.
# BUG FIX: "|| true" is required here — under `set -e` a failing `gh` call
# inside the command substitution aborted the whole script, so the
# "Could not fetch" error branch below was unreachable.
PR_JSON=$(gh pr view "$PR_NUMBER" --json state,mergeable,baseRefName,headRefName,title,body,statusCheckRollup,commits 2>/dev/null || true)
if [[ -z "$PR_JSON" ]]; then
    echo "ERROR: Could not fetch PR #$PR_NUMBER"
    exit 1
fi

PR_STATE=$(echo "$PR_JSON" | jq -r '.state')
PR_MERGEABLE=$(echo "$PR_JSON" | jq -r '.mergeable')
PR_BASE=$(echo "$PR_JSON" | jq -r '.baseRefName')
PR_HEAD=$(echo "$PR_JSON" | jq -r '.headRefName')
PR_TITLE=$(echo "$PR_JSON" | jq -r '.title')
PR_BODY=$(echo "$PR_JSON" | jq -r '.body')
COMMIT_COUNT=$(echo "$PR_JSON" | jq '.commits | length')

# 1. PR is open
echo "=== PR State ==="
if [[ "$PR_STATE" == "OPEN" ]]; then
    echo "PASS: PR is open"
else
    echo "FAIL: PR state is '$PR_STATE'"
    FAILED=1
fi

# 2. Mergeable (no conflicts)
echo ""
echo "=== Merge Conflicts ==="
if [[ "$PR_MERGEABLE" == "MERGEABLE" ]]; then
    echo "PASS: No merge conflicts with $PR_BASE"
elif [[ "$PR_MERGEABLE" == "UNKNOWN" ]]; then
    # GitHub computes mergeability asynchronously; UNKNOWN means "not yet".
    echo "WARN: Mergeability not yet computed (check again shortly)"
else
    echo "FAIL: PR has merge conflicts with $PR_BASE"
    echo " Rebase onto $PR_BASE to fix:"
    echo " git fetch origin $PR_BASE"
    echo " git rebase origin/$PR_BASE"
    echo " git push --force-with-lease"
    FAILED=1
fi

# 3. Branch freshness (commits behind base)
echo ""
echo "=== Branch Freshness ==="
git fetch origin "$PR_BASE" --quiet 2>/dev/null || true
BEHIND=$(git rev-list --count HEAD.."origin/$PR_BASE" 2>/dev/null || echo "?")
if [[ "$BEHIND" == "0" ]]; then
    echo "PASS: Branch is up to date with $PR_BASE"
elif [[ "$BEHIND" == "?" ]]; then
    echo "WARN: Could not determine freshness"
else
    echo "FAIL: Branch is $BEHIND commit(s) behind $PR_BASE"
    echo " Rebase to fix:"
    echo " git rebase origin/$PR_BASE"
    echo " git push --force-with-lease"
    FAILED=1
fi

# 4. PR description (length heuristic plus a test-plan section check)
echo ""
echo "=== PR Description ==="
BODY_LEN=${#PR_BODY}
if [[ "$BODY_LEN" -lt 50 ]]; then
    echo "WARN: PR description is very short ($BODY_LEN chars)"
    echo " Consider adding a summary, change list, and test plan"
else
    echo "PASS: PR description present ($BODY_LEN chars)"
fi

# Check for test plan
if echo "$PR_BODY" | grep -qi "test plan"; then
    echo "PASS: Test plan section found"
else
    echo "WARN: No 'Test plan' section in PR description"
fi

# 5. CI status
echo ""
echo "=== CI Checks ==="
CHECK_COUNT=$(echo "$PR_JSON" | jq '.statusCheckRollup | length' 2>/dev/null || echo "0")
if [[ "$CHECK_COUNT" -gt 0 ]]; then
    PENDING=$(echo "$PR_JSON" | jq '[.statusCheckRollup[] | select(.status != "COMPLETED")] | length')
    FAILED_CHECKS=$(echo "$PR_JSON" | jq '[.statusCheckRollup[] | select(.conclusion == "FAILURE")] | length')
    PASSED_CHECKS=$(echo "$PR_JSON" | jq '[.statusCheckRollup[] | select(.conclusion == "SUCCESS")] | length')

    echo "$PASSED_CHECKS passed, $FAILED_CHECKS failed, $PENDING pending (of $CHECK_COUNT total)"

    if [[ "$FAILED_CHECKS" -gt 0 ]]; then
        echo ""
        echo "Failed checks:"
        echo "$PR_JSON" | jq -r '.statusCheckRollup[] | select(.conclusion == "FAILURE") | " - \(.name)"'
        FAILED=1
    fi
    if [[ "$PENDING" -gt 0 ]]; then
        echo ""
        echo "Pending checks (re-run this script after they complete):"
        echo "$PR_JSON" | jq -r '.statusCheckRollup[] | select(.status != "COMPLETED") | " - \(.name): \(.status)"'
    fi
else
    echo "WARN: No CI checks found (may still be starting)"
fi

# 6. Commit count
echo ""
echo "=== Commits ==="
echo "$COMMIT_COUNT commit(s) in this PR"

# Summary
echo ""
echo "==================================================================="
if [[ $FAILED -eq 1 ]]; then
    echo " ISSUES FOUND — fix before requesting review"
else
    echo " ALL CHECKS PASSED — ready for review"
fi
echo "==================================================================="
echo ""
echo " PR: https://github.com/$(gh repo view --json nameWithOwner -q .nameWithOwner)/pull/$PR_NUMBER"
echo ""

exit $FAILED
.claude/hooks/pre-commit-check.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # PreToolUse hook for Bash: Warn on git commit without /pre-submit-pr
3
+
4
+ # Read JSON from stdin
5
+ INPUT=$(cat)
6
+ COMMAND=$(echo "$INPUT" | jq -r '.tool_input.command // empty' 2>/dev/null)
7
+
8
+ # Only check git commit commands
9
+ if [[ "$COMMAND" != *"git commit"* ]]; then
10
+ exit 0
11
+ fi
12
+
13
+ # Only warn when TDD is active
14
+ source "$(dirname "$0")/tdd-state.sh"
15
+ if ! is_tdd_active; then
16
+ exit 0 # TDD not active, just allow
17
+ fi
18
+
19
+ # Soft warning - don't block, just remind
20
+ cat >&2 << 'EOF'
21
+
22
+ ===================================================================
23
+ REMINDER: Consider running /pre-submit-pr before committing
24
+ ===================================================================
25
+
26
+ This ensures:
27
+ - Lint check passes
28
+ - Tests pass
29
+ - No debug code left in
30
+ - Alignment with principles
31
+
32
+ Proceeding with commit...
33
+
34
+ ===================================================================
35
+
36
+ EOF
37
+
38
+ exit 0
.claude/hooks/pre-pr-check.sh ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# PreToolUse hook for Bash: Block PR creation if branch is stale
#
# Intercepts `gh pr create` and checks branch freshness against the
# base branch. Unlike git hooks, this cannot be bypassed with --no-verify.
#
# Exits 0 to allow the command, 2 to block it with a message on stderr.

# Read JSON from stdin
INPUT=$(cat)
COMMAND=$(echo "$INPUT" | jq -r '.tool_input.command // empty' 2>/dev/null)

# Only check gh pr create commands
if [[ "$COMMAND" != *"gh pr create"* ]]; then
    exit 0
fi

# Determine base branch (default: main).
# Use bash's own regex engine instead of `grep -P`: PCRE grep is a GNU
# extension that is missing on macOS/BSD, where the old lookbehind-based
# extraction silently failed and always fell back to "main". This form
# also accepts both `--base main` and `--base=main`.
BASE="main"
if [[ "$COMMAND" =~ --base[=[:space:]]+([^[:space:]]+) ]]; then
    BASE="${BASH_REMATCH[1]}"
fi

# Fetch latest base and check freshness; "?" means the count could not
# be computed (e.g. unknown base ref) and we allow the PR rather than block.
git fetch origin "$BASE" --quiet 2>/dev/null || true
BEHIND=$(git rev-list --count HEAD.."origin/$BASE" 2>/dev/null || echo "?")

if [[ "$BEHIND" != "0" && "$BEHIND" != "?" ]]; then
    cat >&2 << EOF

===================================================================
PR BLOCKED: Branch is $BEHIND commit(s) behind $BASE
===================================================================

Your PR will show "out of date with base branch" on GitHub.

Fix with:
git fetch origin $BASE
git rebase origin/$BASE
git push --force-with-lease

Then retry gh pr create.

===================================================================

EOF
    exit 2
fi

# Check we're not on main/master
BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null)
if [[ "$BRANCH" == "main" || "$BRANCH" == "master" ]]; then
    cat >&2 << EOF

===================================================================
PR BLOCKED: Cannot create PR from $BRANCH
===================================================================

Create a feature branch first:
git checkout -b <branch-name>
git push -u origin <branch-name>

===================================================================

EOF
    exit 2
fi

exit 0
.claude/hooks/session-start.sh ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# SessionStart hook: Show context and set mode based on TDD state
#
# Prints one of three banners depending on where the session starts:
#   1. TDD MODE ACTIVE   — a .tdd-session.json marker exists (written by
#                          /work-on-issue); direct code edits are blocked.
#   2. WORKTREE          — inside .worktrees/ but TDD not activated.
#   3. MAIN REPO         — anywhere else; explore mode, edits allowed.

echo ""

# Check if we're in a git repo; outside one there is no context to show.
if ! git rev-parse --is-inside-work-tree &>/dev/null; then
    exit 0
fi

TOPLEVEL=$(git rev-parse --show-toplevel)

# Source TDD state helpers (is_tdd_active, get_tdd_issue, ...)
source "$(dirname "$0")/tdd-state.sh"

if is_tdd_active; then
    # TDD mode activated via /work-on-issue
    ISSUE=$(get_tdd_issue)
    FEATURE=$(basename "$TOPLEVEL")
    BRANCH=$(git branch --show-current 2>/dev/null)

    echo "==================================================================="
    echo " TDD MODE ACTIVE (issue #${ISSUE:-?})"
    echo "==================================================================="
    echo " Worktree: $FEATURE"
    echo " Branch: $BRANCH"
    echo ""
    echo " Direct code edits blocked."
    echo ""
    echo " Workflow:"
    echo " /write-tests -> create failing tests"
    echo " /implement -> make tests pass"
    echo " /update-docs -> fix stale docs"
    echo " /simplify -> clean up (optional)"
    echo " /pre-submit-pr -> validate before commit"
    echo ""
    echo " Say \"skip TDD\" to bypass blocking"
    echo "==================================================================="
elif [[ "$TOPLEVEL" == *".worktrees"* ]]; then
    # In a worktree but TDD not activated
    FEATURE=$(basename "$TOPLEVEL")
    BRANCH=$(git branch --show-current 2>/dev/null)

    echo "==================================================================="
    echo " WORKTREE: $FEATURE"
    echo "==================================================================="
    echo " Branch: $BRANCH"
    echo ""
    echo " Direct edits allowed. To enable TDD enforcement:"
    echo " /work-on-issue #<N> -> start TDD workflow"
    echo "==================================================================="
else
    # Main checkout: no enforcement, just point at the entry commands.
    echo "==================================================================="
    echo " MAIN REPO (Explore Mode)"
    echo "==================================================================="
    echo ""
    echo " Direct edits allowed. For focused work:"
    echo " /work-on-issue #42 -> start TDD workflow"
    echo ""
    echo " Or manually:"
    echo " .claude/scripts/worktree-create.sh <name>"
    echo "==================================================================="
fi

echo ""
.claude/hooks/tdd-deactivate.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
#!/bin/bash
# Standalone script to deactivate TDD enforcement.
# Usage: bash .claude/hooks/tdd-deactivate.sh

# Resolve this script's own directory so tdd-state.sh is found
# no matter what the caller's working directory is.
here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
bash "$here/tdd-state.sh" deactivate
.claude/hooks/tdd-state.sh ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Shared TDD state helpers.
#
# Can be used two ways:
#   1. Sourced: source tdd-state.sh && is_tdd_active
#   2. Direct:  bash tdd-state.sh activate 42
#
# TDD is activated by /work-on-issue, which writes .tdd-session.json
# to the worktree root. All hooks check this file instead of the
# .worktrees path, making TDD opt-in.

# Echo the repo/worktree root; fails when run outside a git repo.
_tdd_toplevel() {
    git rev-parse --show-toplevel 2>/dev/null
}

# True when the session marker file exists at the worktree root.
is_tdd_active() {
    local root
    root=$(_tdd_toplevel) || return 1
    test -f "$root/.tdd-session.json"
}

# Print the issue number recorded in the marker (empty when absent).
get_tdd_issue() {
    local root
    root=$(_tdd_toplevel) || return 1
    jq -r '.issue // empty' "$root/.tdd-session.json" 2>/dev/null
}

# Write the session marker that switches TDD enforcement on.
activate_tdd() {
    local issue="$1"
    if [[ -z "$issue" ]]; then
        echo "Usage: activate_tdd <issue-number>" >&2
        return 1
    fi
    local root branch
    root=$(_tdd_toplevel) || return 1
    branch=$(git branch --show-current 2>/dev/null)

    jq -n \
        --arg issue "$issue" \
        --arg branch "$branch" \
        --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
        '{issue: $issue, branch: $branch, activated_at: $ts}' \
        > "$root/.tdd-session.json"

    echo "TDD enforcement activated for issue #$issue"
}

# Remove the session marker, switching TDD enforcement off.
deactivate_tdd() {
    local root
    root=$(_tdd_toplevel) || return 1
    if [[ -f "$root/.tdd-session.json" ]]; then
        rm "$root/.tdd-session.json"
        echo "TDD enforcement deactivated"
    else
        echo "TDD was not active"
    fi
}

# When executed directly (not sourced), dispatch subcommands
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    case "${1:-}" in
        activate)   activate_tdd "$2" ;;
        deactivate) deactivate_tdd ;;
        active)     is_tdd_active ;;
        issue)      get_tdd_issue ;;
        *)
            echo "Usage: bash $0 {activate <issue>|deactivate|active|issue}" >&2
            exit 1
            ;;
    esac
fi
.claude/hooks/test.sh ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Test runner for OpenEnv
# Runs pytest excluding environments that need special setup

set -e

# Check for required tools
if ! command -v uv &> /dev/null; then
    echo "Error: 'uv' is not installed or not in PATH"
    echo "Install with: curl -LsSf https://astral.sh/uv/install.sh | sh"
    exit 1
fi

echo "=== Running tests ==="
# Note: Using timeout to prevent hanging tests from blocking indefinitely (5 min max)
# Matches .github/workflows/test.yml exactly to catch CI failures before push
#
# BUG FIX: the exit code must be captured with `|| TEST_EXIT_CODE=$?`.
# Under `set -e`, a plain failing pytest command aborts the script
# immediately, so the old `TEST_EXIT_CODE=$?` line and the timeout (124)
# diagnostic below were unreachable dead code.
TEST_EXIT_CODE=0
PYTHONPATH=src:envs timeout 300 uv run pytest tests/ \
    --ignore=tests/envs/test_browsergym_environment.py \
    --ignore=tests/envs/test_dipg_environment.py \
    --ignore=tests/envs/test_websearch_environment.py \
    --ignore=tests/envs/test_python_codeact_reset.py \
    --ignore=tests/envs/test_python_codeact_rewards.py \
    --ignore=tests/envs/test_textarena_environment.py \
    -m "not integration and not network and not docker" \
    -v \
    --tb=short || TEST_EXIT_CODE=$?

# GNU timeout exits 124 when the command was killed for running too long.
if [ $TEST_EXIT_CODE -eq 124 ]; then
    echo "ERROR: Tests timed out after 5 minutes"
    exit 1
elif [ $TEST_EXIT_CODE -ne 0 ]; then
    echo "=== Tests failed ==="
    exit $TEST_EXIT_CODE
fi

echo "=== Tests completed ==="
.claude/scripts/worktree-cleanup.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Clean up a git worktree after PR is merged
set -e

if [[ -z "$1" ]]; then
    echo "Usage: $0 <worktree-path>"
    echo ""
    echo "Example: $0 .worktrees/add-auth"
    echo "Removes the worktree and optionally deletes the branch"
    exit 1
fi

WORKTREE_PATH="$1"

# A linked worktree has a .git *file* (not a directory) pointing back
# at the main repository — that is what distinguishes it from a clone.
if [[ ! -d "$WORKTREE_PATH" ]]; then
    echo "ERROR: Directory does not exist: $WORKTREE_PATH"
    exit 1
fi

if [[ ! -f "$WORKTREE_PATH/.git" ]]; then
    echo "ERROR: Not a git worktree: $WORKTREE_PATH"
    exit 1
fi

# Resolve the branch without leaving the current directory.
BRANCH=$(git -C "$WORKTREE_PATH" branch --show-current)

echo "Removing worktree: $WORKTREE_PATH"
echo "Branch: $BRANCH"
echo ""

# Remove the worktree
git worktree remove "$WORKTREE_PATH" --force

echo "Worktree removed."
echo ""

# Ask about branch deletion
read -p "Delete branch '$BRANCH'? (y/N) " -n 1 -r
echo ""

if [[ $REPLY =~ ^[Yy]$ ]]; then
    git branch -D "$BRANCH"
    echo "Branch deleted."
else
    echo "Branch kept. Delete manually with: git branch -D $BRANCH"
fi

echo ""
echo "Cleanup complete!"
.claude/scripts/worktree-create.sh ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Create a git worktree for a new feature branch
set -e

if [[ -z "$1" ]]; then
    echo "Usage: $0 <branch-name>"
    echo ""
    echo "Example: $0 add-auth"
    echo "Creates: .worktrees/add-auth with branch feature/add-auth"
    exit 1
fi

name="$1"
feature_branch="feature/$name"

# All worktrees live under .worktrees/ at the repository root.
repo_root=$(git rev-parse --show-toplevel)
worktree_path="$repo_root/.worktrees/$name"
mkdir -p "$repo_root/.worktrees"

if [[ -d "$worktree_path" ]]; then
    echo "ERROR: Worktree already exists at $worktree_path"
    exit 1
fi

# Reuse the branch when it already exists; otherwise create it with -b.
if git show-ref --verify --quiet "refs/heads/$feature_branch"; then
    echo "Branch $feature_branch already exists, using existing branch"
    git worktree add "$worktree_path" "$feature_branch"
else
    echo "Creating new branch $feature_branch"
    git worktree add -b "$feature_branch" "$worktree_path"
fi

echo ""
echo "Worktree created successfully!"
echo ""
echo "Path: $worktree_path"
echo "Branch: $feature_branch"
echo ""
echo "To start working:"
echo " cd .worktrees/$name"
.claude/settings.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(gh auth status:*)"
5
+ ]
6
+ },
7
+ "hooks": {
8
+ "SessionStart": [
9
+ {
10
+ "hooks": [
11
+ {
12
+ "type": "command",
13
+ "command": ".claude/hooks/session-start.sh"
14
+ }
15
+ ]
16
+ }
17
+ ],
18
+ "PreToolUse": [
19
+ {
20
+ "matcher": "Bash",
21
+ "hooks": [
22
+ {
23
+ "type": "command",
24
+ "command": ".claude/hooks/pre-commit-check.sh"
25
+ },
26
+ {
27
+ "type": "command",
28
+ "command": ".claude/hooks/pre-pr-check.sh"
29
+ }
30
+ ]
31
+ },
32
+ {
33
+ "matcher": "Edit|Write",
34
+ "hooks": [
35
+ {
36
+ "type": "command",
37
+ "command": ".claude/hooks/no-direct-code.sh"
38
+ }
39
+ ]
40
+ }
41
+ ],
42
+ "PostToolUse": [
43
+ {
44
+ "matcher": "TodoWrite",
45
+ "hooks": [
46
+ {
47
+ "type": "command",
48
+ "command": ".claude/hooks/delegate-todos.sh"
49
+ }
50
+ ]
51
+ }
52
+ ],
53
+ "Stop": [
54
+ {
55
+ "hooks": [
56
+ {
57
+ "type": "prompt",
58
+ "prompt": "First, perform quick checks to avoid unnecessary TDD evaluation:\n\n0. TDD CONTEXT CHECK: Look at the session start output. If there is NO 'TDD MODE ACTIVE' banner and the session was not initiated via /work-on-issue, return 'stop' immediately. TDD enforcement only applies when explicitly activated.\n\n1. SKIP CHECK: If the user's message contains phrases like 'skip TDD', 'no TDD', 'just discussing', 'exploration only', or similar opt-out language, return 'stop' immediately.\n\n2. EDIT CHECK: Look at Claude's actions in this turn. Did Claude edit any implementation files (*.py files in src/ or envs/)? If NO implementation files were edited, return 'stop' immediately.\n\n3. TDD EVALUATION: Only if implementation files were edited AND no opt-out phrase was used, evaluate TDD compliance: (a) Did tests for the edited functionality exist first? (b) If starting new work, were requirements gathered and tests written before implementing? (c) If creating a PR or commit, is it linked to a GitHub issue?\n\nReturn 'continue' with corrective instructions if TDD was violated. Return 'stop' if workflow was followed or checks 0-2 passed."
59
+ }
60
+ ]
61
+ }
62
+ ],
63
+ "SubagentStop": [
64
+ {
65
+ "matcher": "tester",
66
+ "hooks": [
67
+ {
68
+ "type": "command",
69
+ "command": ".claude/hooks/after-tester.sh"
70
+ }
71
+ ]
72
+ },
73
+ {
74
+ "matcher": "implementer",
75
+ "hooks": [
76
+ {
77
+ "type": "command",
78
+ "command": ".claude/hooks/after-implementer.sh"
79
+ }
80
+ ]
81
+ },
82
+ {
83
+ "matcher": "docs-updater",
84
+ "hooks": [
85
+ {
86
+ "type": "command",
87
+ "command": ".claude/hooks/after-docs-updater.sh"
88
+ }
89
+ ]
90
+ },
91
+ {
92
+ "hooks": [
93
+ {
94
+ "type": "prompt",
95
+ "prompt": "Evaluate if the subagent completed its task successfully. For issue-worker: did it extract actionable requirements and acceptance criteria? For tester: did it produce tests with clear assertions? For pre-submit: did validation complete? Return 'continue' if the agent needs to do more work, 'stop' if complete."
96
+ }
97
+ ]
98
+ }
99
+ ]
100
+ },
101
+ "enabledPlugins": {
102
+ "code-simplifier@claude-plugins-official": true,
103
+ "pr-review-toolkit@claude-plugins-official": true
104
+ }
105
+ }
.claude/skills/alignment-review/SKILL.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: alignment-review
3
+ description: Review code changes for bugs and alignment with OpenEnv principles and RFCs. Use when reviewing PRs, checking code before commit, or when asked to review changes. Implements two-tier review model.
4
+ allowed-tools: Read, Grep, Glob, Bash
5
+ ---
6
+
7
+ # Alignment Review
8
+
9
+ Review code changes for alignment with OpenEnv principles using a two-tier model.
10
+
11
+ ## Instructions
12
+
13
+ 1. **Run automated checks first**:
14
+ - Execute `bash .claude/hooks/lint.sh` - capture lint issues
15
+ - Execute `bash .claude/hooks/check-debug.sh` - capture debug code
16
+
17
+ 2. **Read alignment documents**:
18
+ - `.claude/docs/PRINCIPLES.md` - design principles
19
+ - `.claude/docs/INVARIANTS.md` - system invariants
20
+
21
+ 3. **Read open RFCs**:
22
+ - Scan `rfcs/` directory for all RFC files
23
+ - Note the status of each RFC (Draft, In Review, Accepted, Implemented)
24
+ - Pay special attention to Draft and In Review RFCs - these represent active design discussions
25
+
26
+ 4. **Analyze changes** (use `git diff` or provided diff):
27
+ - Identify mechanical issues (Tier 1)
28
+ - Flag alignment concerns (Tier 2)
29
+ - Flag conflicts with open RFCs (Tier 2)
30
+
31
+ ## Tier 1: Uncontentious Issues (Fix Immediately)
32
+
33
+ These are issues to fix without human input:
34
+ - Lint failures from hook output
35
+ - Debug code from hook output (print statements, breakpoints)
36
+ - Uninitialized variables, type errors
37
+ - Missing imports, syntax errors
38
+ - Security issues (credential exposure, injection vulnerabilities)
39
+
40
+ ## Tier 2: Alignment Discussion Points
41
+
42
+ For each potential alignment concern, format as:
43
+
44
+ ```
45
+ **ALIGNMENT FLAG**: [Brief description]
46
+ - **Principle/RFC at stake**: [Which principle from PRINCIPLES.md or RFC number]
47
+ - **The concern**: [What seems misaligned or in conflict]
48
+ - **Suggested reviewer**: @darktex [pull actual reviewers based on authors of the specific line of PRINCIPLES.md and INVARIANTS.md using git blame, and/or authors of conflicting RFCs]
49
+ ```
50
+
51
+ ### Examples of Tier 2 Issues
52
+
53
+ **Principle conflicts:**
54
+ - Adding external reward computation (violates "rewards in environment")
55
+ - Client importing server code (violates client-server separation)
56
+ - New API that differs from Gymnasium pattern
57
+
58
+ **RFC conflicts (flag even for Draft/In Review RFCs):**
59
+ - Change conflicts with design proposed in an open RFC
60
+ - Change pre-empts a decision being discussed in an RFC
61
+ - Change implements something differently than an RFC proposes
62
+ - Change affects an area covered by an RFC under review
63
+
64
+ **Why flag RFC conflicts?** Even if an RFC isn't finalized, flagging conflicts helps focus design discussions. The change might be correct and the RFC might need updating, or vice versa - either way, the team should discuss.
65
+
66
+ ## Output Format
67
+
68
+ ```
69
+ ## Alignment Review Report
70
+
71
+ ### Automated Checks
72
+ - Lint: [PASS/FAIL] - [summary]
73
+ - Debug code: [CLEAN/FOUND] - [details]
74
+
75
+ ### Open RFCs Context
76
+ [List any RFCs in Draft or In Review status that might be relevant to these changes]
77
+
78
+ ### Tier 1: Fixes Required
79
+ - [ ] path/file.py:123 - [issue description]
80
+ - [ ] path/file.py:456 - [issue description]
81
+
82
+ ### Tier 2: Alignment Discussion
83
+
84
+ #### Principle Conflicts
85
+ [ALIGNMENT FLAGS for principle violations, or "None identified"]
86
+
87
+ #### RFC Conflicts
88
+ [ALIGNMENT FLAGS for RFC conflicts, or "None identified"]
89
+
90
+ ### Summary
91
+ - X mechanical issues to fix
92
+ - Y alignment points for human review
93
+ - Z RFC conflicts to discuss
94
+ ```
.claude/skills/generate-openenv-env/SKILL.md ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: generate-openenv-env
3
+ description: Generate OpenEnv environments from a concrete use case (for example, "generate an env for the library textarena"). Use when asked to design or implement a new environment under envs/ by researching a target library/API, selecting matching OpenEnv examples, asking key implementation questions, and building models/client/server/openenv.yaml. Do not use for model training or evaluation tasks.
4
+ ---
5
+
6
+ # /generate-openenv-env
7
+
8
+ Build a production-ready OpenEnv environment from a use-case prompt.
9
+
10
+ ## Execute Workflow
11
+
12
+ When invoked, execute this workflow end-to-end.
13
+
14
+ ### 1. Parse the use case and name the environment
15
+
16
+ Derive a repo path in the form `envs/<name>_env/`.
17
+
18
+ - Normalize to snake_case.
19
+ - Keep names short and domain-specific.
20
+ - Example: "generate an env for the library textarena" -> `envs/textarena_env/`.
21
+
22
+ ### 2. Research the target library/API before coding
23
+
24
+ Gather the minimum interface facts needed to implement `reset`, `step`, and state serialization.
25
+
26
+ - Search local docs/examples first.
27
+ - Search upstream docs/repo for the target library when local context is insufficient.
28
+ - Extract only implementation-critical details:
29
+ - installation/dependency requirements
30
+ - environment creation API
31
+ - action format
32
+ - observation format
33
+ - reward and done semantics
34
+ - special setup (model files, downloads, auth, etc.)
35
+
36
+ ### 3. Mine matching OpenEnv examples
37
+
38
+ Select 2-3 existing environments as implementation templates.
39
+
40
+ - Always read `references/openenv-tutorial-01-environments.md` (Part 10) and `references/openenv-docs-environment-builder.md`.
41
+ - Prefer `envs/textarena_env` for external-library wrappers with richer state.
42
+ - Add one simpler baseline (for example `envs/snake_env` or `envs/echo_env`) to keep the implementation minimal.
43
+ - Follow patterns, do not copy blindly.
44
+ - Exclude generated or vendored files when mining examples (`.venv/`, `build/`, `site-packages/`, `__pycache__/`).
45
+
46
+ For a compact checklist and mapping, read `references/env-generation-checklist.md`.
47
+
48
+ ### 4. Ask focused implementation questions
49
+
50
+ Ask only the questions that materially affect architecture. Use the question bank in `references/env-generation-checklist.md`.
51
+
52
+ Cover at least:
53
+ - action space contract
54
+ - observation fields needed by agents
55
+ - reward design and terminal conditions
56
+ - episode/session configuration knobs
57
+ - deployment target and dependency constraints
58
+
59
+ If answers are unavailable, proceed with explicit assumptions and document them.
60
+
61
+ ### 5. Choose the environment archetype
62
+
63
+ Choose one archetype before scaffolding:
64
+
65
+ - Typed step/reset environment (default): use `EnvClient` + typed `Action/Observation[/State]` models.
66
+ - MCP tool environment: use `MCPEnvironment` + `MCPToolClient` and MCP action/observation types.
67
+ - Specialized client flow (rare): only when the standard clients cannot express required behavior (for example local+remote hybrid clients).
68
+
69
+ ### 6. Scaffold the environment
70
+
71
+ Use the CLI to scaffold:
72
+
73
+ ```bash
74
+ PYTHONPATH=src uv run openenv init <name>_env --output-dir envs
75
+ ```
76
+
77
+ This generates all files with correct placeholders replaced, including `pyproject.toml`, `Dockerfile`, and `uv.lock`.
78
+
79
+ If the CLI is unavailable (import errors, missing dependencies), create the structure manually matching:
80
+
81
+ ```text
82
+ envs/<name>_env/
83
+ ├── __init__.py
84
+ ├── client.py
85
+ ├── models.py
86
+ ├── openenv.yaml
87
+ ├── pyproject.toml
88
+ └── server/
89
+ ├── __init__.py
90
+ ├── app.py
91
+ ├── <name>_environment.py
92
+ └── Dockerfile
93
+ ```
94
+
95
+ Use `assets/openenv_env_template/` as a reference for file contents when scaffolding manually.
96
+
97
+ ### 7. Implement with OpenEnv contracts
98
+
99
+ Implement these files in order:
100
+
101
+ 1. `models.py`
102
+ 2. `server/<name>_environment.py`
103
+ 3. `server/app.py`
104
+ 4. `client.py`
105
+ 5. `openenv.yaml`
106
+ 6. `README.md`
107
+
108
+ Use these standards:
109
+ - Use typed models (Action/Observation/State).
110
+ - Use `create_app(<factory_or_class>, ActionType, ObservationType, env_name=...)` in `server/app.py`. Pass a class or factory callable, not an instantiated environment.
111
+ - **Dual-import pattern** (required in `server/app.py` and `server/<name>_environment.py`): Use `try: from ..models import X / except ImportError: from models import X`. Relative imports work in-repo (`PYTHONPATH=src:envs`); bare imports work in Docker (`PYTHONPATH=/app/env`). The same pattern applies to intra-server imports (e.g., `from .foo import Bar` vs `from server.foo import Bar`).
112
+ - `client.py` uses `EnvClient[ActionType, ObservationType, State]` (three type parameters).
113
+ - Keep server logic in `server/`, keep client parsing in `client.py`.
114
+ - Expose config through environment variables when behavior is likely to vary.
115
+ - Keep reward logic inside the environment.
116
+ - Prefer reset/step signatures compatible with `Environment`:
117
+ - `reset(seed=None, episode_id=None, **kwargs)`
118
+ - `step(action, timeout_s=None, **kwargs)`
119
+ - Set `SUPPORTS_CONCURRENT_SESSIONS=True` only when isolation is real. Set `max_concurrent_envs` in `create_app` accordingly (1 when `False`, >1 when `True`).
120
+ - For MCP/tool-call UIs that send stringified JSON arguments, add action validators/parsers in `server/app.py`.
121
+ - Export public client/models symbols in `__init__.py`.
122
+ - Keep `openenv.yaml` aligned with current scaffold format (`spec_version: 1`, `name`, `type`, `runtime`, `app`, `port`).
123
+ - Avoid training/evaluation code paths in this skill.
124
+
125
+ ### 8. Validate before handoff
126
+
127
+ Run the narrowest useful checks:
128
+
129
+ ```bash
130
+ # Verify in-repo imports work (catches missing dual-import pattern)
131
+ PYTHONPATH=src:envs uv run python -c "from envs.<name>_env.server.<name>_environment import <ClassName>Environment"
132
+
133
+ # Build and validate
134
+ cd envs/<name>_env
135
+ openenv build
136
+ openenv validate --verbose
137
+ PYTHONPATH=src:envs uv run pytest envs/<name>_env -q
138
+ ```
139
+
140
+ If tests do not exist, run a smoke check:
141
+
142
+ ```bash
143
+ PYTHONPATH=src:envs uv run uvicorn envs.<name>_env.server.app:app --port 8000
144
+ curl http://localhost:8000/health
145
+ openenv validate --url http://localhost:8000
146
+ ```
147
+
148
+ ### 9. Deliver with assumptions and gaps
149
+
150
+ Report:
151
+ - files created/updated
152
+ - chosen archetype (typed vs MCP vs specialized)
153
+ - assumptions made due to missing answers
154
+ - validation commands executed and outcomes
155
+ - remaining risks or follow-up questions
156
+
157
+ ## Guardrails
158
+
159
+ - Do not route into model training/evaluation workflows.
160
+ - Do not invent library APIs; confirm against source docs.
161
+ - Do not skip reading at least one existing OpenEnv env before implementation.
162
+ - Do not copy outdated manifest patterns from older envs (`name/version/action/observation`-only manifests).
163
+ - Do not copy build artifacts or virtualenv files from example envs.
164
+ - Do not set `max_concurrent_envs > 1` unless the environment explicitly supports concurrent sessions.
.claude/skills/generate-openenv-env/agents/openai.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ interface:
2
+ display_name: "OpenEnv Env Generator"
3
+ short_description: "Generate OpenEnv environments from use cases"
4
+ default_prompt: "Use $generate-openenv-env to turn a use case into a complete OpenEnv environment scaffold."
.claude/skills/generate-openenv-env/assets/openenv_env_template/.dockerignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv
2
+ .git
3
+ .gitignore
4
+ .env
5
+ __pycache__/
6
+ *.pyc
7
+ *.pyo
8
+ *.pyd
9
+ *.pyw
10
+ *.pyz
11
+ *.pywz
12
+ *.pyzw
13
+ *.pyzwz
14
+
15
+
.claude/skills/generate-openenv-env/assets/openenv_env_template/README.md ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: __ENV_TITLE_NAME__ Environment Server
3
+ emoji: __HF_EMOJI__
4
+ colorFrom: __HF_COLOR_FROM__
5
+ colorTo: __HF_COLOR_TO__
6
+ sdk: docker
7
+ pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
+ ---
13
+
14
+ # __ENV_TITLE_NAME__ Environment
15
+
16
+ A simple test environment that echoes back messages. Perfect for testing the env APIs as well as demonstrating environment usage patterns.
17
+
18
+ ## Quick Start
19
+
20
+ The simplest way to use the __ENV_TITLE_NAME__ environment is through the `__ENV_CLASS_NAME__Env` class:
21
+
22
+ ```python
23
+ from __ENV_NAME__ import __ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Env
24
+
25
+ try:
26
+ # Create environment from Docker image
27
+ __ENV_NAME__env = __ENV_CLASS_NAME__Env.from_docker_image("__ENV_NAME__-env:latest")
28
+
29
+ # Reset
30
+ result = __ENV_NAME__env.reset()
31
+ print(f"Reset: {result.observation.echoed_message}")
32
+
33
+ # Send multiple messages
34
+ messages = ["Hello, World!", "Testing echo", "Final message"]
35
+
36
+ for msg in messages:
37
+ result = __ENV_NAME__env.step(__ENV_CLASS_NAME__Action(message=msg))
38
+ print(f"Sent: '{msg}'")
39
+ print(f" → Echoed: '{result.observation.echoed_message}'")
40
+ print(f" → Length: {result.observation.message_length}")
41
+ print(f" → Reward: {result.reward}")
42
+
43
+ finally:
44
+ # Always clean up
45
+ __ENV_NAME__env.close()
46
+ ```
47
+
48
+ That's it! The `__ENV_CLASS_NAME__Env.from_docker_image()` method handles:
49
+ - Starting the Docker container
50
+ - Waiting for the server to be ready
51
+ - Connecting to the environment
52
+ - Container cleanup when you call `close()`
53
+
54
+ ## Building the Docker Image
55
+
56
+ Before using the environment, you need to build the Docker image:
57
+
58
+ ```bash
59
+ # From project root
60
+ docker build -t __ENV_NAME__-env:latest -f server/Dockerfile .
61
+ ```
62
+
63
+ ## Deploying to Hugging Face Spaces
64
+
65
+ You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
66
+
67
+ ```bash
68
+ # From the environment directory (where openenv.yaml is located)
69
+ openenv push
70
+
71
+ # Or specify options
72
+ openenv push --namespace my-org --private
73
+ ```
74
+
75
+ The `openenv push` command will:
76
+ 1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
77
+ 2. Prepare a custom build for Hugging Face Docker space (enables web interface)
78
+ 3. Upload to Hugging Face (ensuring you're logged in)
79
+
80
+ ### Prerequisites
81
+
82
+ - Authenticate with Hugging Face: The command will prompt for login if not already authenticated
83
+
84
+ ### Options
85
+
86
+ - `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
87
+ - `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
88
+ - `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
89
+ - `--private`: Deploy the space as private (default: public)
90
+
91
+ ### Examples
92
+
93
+ ```bash
94
+ # Push to your personal namespace (defaults to username/env-name from openenv.yaml)
95
+ openenv push
96
+
97
+ # Push to a specific repository
98
+ openenv push --repo-id my-org/my-env
99
+
100
+ # Push with a custom base image
101
+ openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
102
+
103
+ # Push as a private space
104
+ openenv push --private
105
+
106
+ # Combine options
107
+ openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
108
+ ```
109
+
110
+ After deployment, your space will be available at:
111
+ `https://huggingface.co/spaces/<repo-id>`
112
+
113
+ The deployed space includes:
114
+ - **Web Interface** at `/web` - Interactive UI for exploring the environment
115
+ - **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
116
+ - **Health Check** at `/health` - Container health monitoring
117
+ - **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
118
+
119
+ ## Environment Details
120
+
121
+ ### Action
122
+ **__ENV_CLASS_NAME__Action**: Contains a single field
123
+ - `message` (str) - The message to echo back
124
+
125
+ ### Observation
126
+ **__ENV_CLASS_NAME__Observation**: Contains the echo response and metadata
127
+ - `echoed_message` (str) - The message echoed back
128
+ - `message_length` (int) - Length of the message
129
+ - `reward` (float) - Reward based on message length (length × 0.1)
130
+ - `done` (bool) - Always False for echo environment
131
+ - `metadata` (dict) - Additional info like step count
132
+
133
+ ### Reward
134
+ The reward is calculated as: `message_length × 0.1`
135
+ - "Hi" → reward: 0.2
136
+ - "Hello, World!" → reward: 1.3
137
+ - Empty message → reward: 0.0
138
+
139
+ ## Advanced Usage
140
+
141
+ ### Connecting to an Existing Server
142
+
143
+ If you already have a __ENV_TITLE_NAME__ environment server running, you can connect directly:
144
+
145
+ ```python
146
+ from __ENV_NAME__ import __ENV_CLASS_NAME__Env
147
+
148
+ # Connect to existing server
149
+ __ENV_NAME__env = __ENV_CLASS_NAME__Env(base_url="<ENV_HTTP_URL_HERE>")
150
+
151
+ # Use as normal
152
+ result = __ENV_NAME__env.reset()
153
+ result = __ENV_NAME__env.step(__ENV_CLASS_NAME__Action(message="Hello!"))
154
+ ```
155
+
156
+ Note: When connecting to an existing server, `__ENV_NAME__env.close()` will NOT stop the server.
157
+
158
+ ### Using the Context Manager
159
+
160
+ The client supports context manager usage for automatic connection management:
161
+
162
+ ```python
163
+ from __ENV_NAME__ import __ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Env
164
+
165
+ # Connect with context manager (auto-connects and closes)
166
+ with __ENV_CLASS_NAME__Env(base_url="http://localhost:8000") as env:
167
+ result = env.reset()
168
+ print(f"Reset: {result.observation.echoed_message}")
169
+ # Multiple steps with low latency
170
+ for msg in ["Hello", "World", "!"]:
171
+ result = env.step(__ENV_CLASS_NAME__Action(message=msg))
172
+ print(f"Echoed: {result.observation.echoed_message}")
173
+ ```
174
+
175
+ The client uses WebSocket connections for:
176
+ - **Lower latency**: No HTTP connection overhead per request
177
+ - **Persistent session**: Server maintains your environment state
178
+ - **Efficient for episodes**: Better for many sequential steps
179
+
180
+ ### Concurrent WebSocket Sessions
181
+
182
+ The server supports multiple concurrent WebSocket connections. To enable this,
183
+ modify `server/app.py` to use factory mode:
184
+
185
+ ```python
186
+ # In server/app.py - use factory mode for concurrent sessions
187
+ app = create_app(
188
+ __ENV_CLASS_NAME__Environment, # Pass class, not instance
189
+ __ENV_CLASS_NAME__Action,
190
+ __ENV_CLASS_NAME__Observation,
191
+ max_concurrent_envs=4, # Allow 4 concurrent sessions
192
+ )
193
+ ```
194
+
195
+ Then multiple clients can connect simultaneously:
196
+
197
+ ```python
198
+ from __ENV_NAME__ import __ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Env
199
+ from concurrent.futures import ThreadPoolExecutor
200
+
201
+ def run_episode(client_id: int):
202
+ with __ENV_CLASS_NAME__Env(base_url="http://localhost:8000") as env:
203
+ result = env.reset()
204
+ for i in range(10):
205
+ result = env.step(__ENV_CLASS_NAME__Action(message=f"Client {client_id}, step {i}"))
206
+ return client_id, result.observation.message_length
207
+
208
+ # Run 4 episodes concurrently
209
+ with ThreadPoolExecutor(max_workers=4) as executor:
210
+ results = list(executor.map(run_episode, range(4)))
211
+ ```
212
+
213
+ ## Development & Testing
214
+
215
+ ### Direct Environment Testing
216
+
217
+ Test the environment logic directly without starting the HTTP server:
218
+
219
+ ```bash
220
+ # From the server directory
221
+ python3 server/__ENV_NAME___environment.py
222
+ ```
223
+
224
+ This verifies that:
225
+ - Environment resets correctly
226
+ - Step executes actions properly
227
+ - State tracking works
228
+ - Rewards are calculated correctly
229
+
230
+ ### Running Locally
231
+
232
+ Run the server locally for development:
233
+
234
+ ```bash
235
+ uvicorn server.app:app --reload
236
+ ```
237
+
238
+ ## Project Structure
239
+
240
+ ```
241
+ __ENV_NAME__/
242
+ ├── .dockerignore # Docker build exclusions
243
+ ├── __init__.py # Module exports
244
+ ├── README.md # This file
245
+ ├── openenv.yaml # OpenEnv manifest
246
+ ├── pyproject.toml # Project metadata and dependencies
247
+ ├── uv.lock # Locked dependencies (generated)
248
+ ├── client.py # __ENV_CLASS_NAME__Env client
249
+ ├── models.py # Action and Observation models
250
+ └── server/
251
+ ├── __init__.py # Server module exports
252
+ ├── __ENV_NAME___environment.py # Core environment logic
253
+ ├── app.py # FastAPI application (HTTP + WebSocket endpoints)
254
+ └── Dockerfile # Container image definition
255
+ ```
.claude/skills/generate-openenv-env/assets/openenv_env_template/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """__ENV_TITLE_NAME__ Environment."""
8
+
9
+ from .client import __ENV_CLASS_NAME__Env
10
+ from .models import __ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Observation
11
+
12
# Public API re-exported at the package root so callers can write
# `from __ENV_NAME__ import __ENV_CLASS_NAME__Env`, etc.
__all__ = [
    "__ENV_CLASS_NAME__Action",
    "__ENV_CLASS_NAME__Observation",
    "__ENV_CLASS_NAME__Env",
]
.claude/skills/generate-openenv-env/assets/openenv_env_template/client.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """__ENV_TITLE_NAME__ Environment Client."""
8
+
9
+ from typing import Dict
10
+
11
+ from openenv.core import EnvClient
12
+ from openenv.core.client_types import StepResult
13
+ from openenv.core.env_server.types import State
14
+
15
+ from .models import __ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Observation
16
+
17
+
18
class __ENV_CLASS_NAME__Env(
    EnvClient[__ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Observation, State]
):
    """WebSocket client for the __ENV_TITLE_NAME__ environment.

    Each instance keeps a persistent WebSocket session open against the
    environment server, so sequential ``reset``/``step`` calls avoid
    per-request HTTP overhead. Every client gets its own dedicated
    environment session on the server.

    Example:
        >>> # Connect to a running server
        >>> with __ENV_CLASS_NAME__Env(base_url="http://localhost:8000") as client:
        ...     result = client.reset()
        ...     print(result.observation.echoed_message)
        ...
        ...     result = client.step(__ENV_CLASS_NAME__Action(message="Hello!"))
        ...     print(result.observation.echoed_message)

    Example with Docker:
        >>> # Automatically start container and connect
        >>> client = __ENV_CLASS_NAME__Env.from_docker_image("__ENV_NAME__-env:latest")
        >>> try:
        ...     result = client.reset()
        ...     result = client.step(__ENV_CLASS_NAME__Action(message="Test"))
        ... finally:
        ...     client.close()
    """

    def _step_payload(self, action: __ENV_CLASS_NAME__Action) -> Dict:
        """Serialize *action* into the JSON body of a step message.

        Args:
            action: The action to send to the server.

        Returns:
            A JSON-encodable dictionary.
        """
        return {"message": action.message}

    def _parse_result(self, payload: Dict) -> StepResult[__ENV_CLASS_NAME__Observation]:
        """Build a ``StepResult`` from a raw server response payload.

        Args:
            payload: Decoded JSON response from the server.

        Returns:
            StepResult carrying a __ENV_CLASS_NAME__Observation.
        """
        raw_obs = payload.get("observation", {})
        # done/reward live at the top level of the payload; the
        # observation fields live under the "observation" key.
        done_flag = payload.get("done", False)
        step_reward = payload.get("reward")

        obs = __ENV_CLASS_NAME__Observation(
            echoed_message=raw_obs.get("echoed_message", ""),
            message_length=raw_obs.get("message_length", 0),
            done=done_flag,
            reward=step_reward,
            metadata=raw_obs.get("metadata", {}),
        )
        return StepResult(
            observation=obs,
            reward=step_reward,
            done=done_flag,
        )

    def _parse_state(self, payload: Dict) -> State:
        """Deserialize a ``/state`` response into a ``State`` object.

        Args:
            payload: Decoded JSON response from the state request.

        Returns:
            State with ``episode_id`` and ``step_count``.
        """
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
.claude/skills/generate-openenv-env/assets/openenv_env_template/models.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Data models for the __ENV_TITLE_NAME__ Environment.
9
+
10
+ The __ENV_NAME__ environment is a simple test environment that echoes back messages.
11
+ """
12
+
13
+ from openenv.core.env_server.types import Action, Observation
14
+ from pydantic import Field
15
+
16
+
17
class __ENV_CLASS_NAME__Action(Action):
    """Action for the __ENV_TITLE_NAME__ environment - just a message to echo."""

    # Required payload; the server echoes this string back verbatim.
    message: str = Field(..., description="Message to echo back")
21
+
22
+
23
class __ENV_CLASS_NAME__Observation(Observation):
    """Observation from the __ENV_TITLE_NAME__ environment - the echoed message."""

    # The message from the action, returned unchanged by the server.
    echoed_message: str = Field(default="", description="The echoed message")
    # Character count of echoed_message as computed server-side.
    message_length: int = Field(default=0, description="Length of the echoed message")
.claude/skills/generate-openenv-env/assets/openenv_env_template/openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# OpenEnv manifest; the `openenv push` command validates its presence
# and reads the environment name from it.
spec_version: 1
name: __ENV_NAME__
type: space
runtime: fastapi
# Dotted module path to the FastAPI app object served by uvicorn.
app: server.app:app
port: 8000
.claude/skills/generate-openenv-env/assets/openenv_env_template/pyproject.toml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-__ENV_NAME__"
13
+ version = "0.1.0"
14
+ description = "__ENV_TITLE_NAME__ environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ # Examples:
24
+ # "numpy>=1.19.0",
25
+ # "torch>=2.0.0",
26
+ # "gymnasium>=0.29.0",
27
+ # "openspiel>=1.0.0",
28
+ # "smolagents>=1.22.0,<2",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest>=8.0.0",
34
+ "pytest-cov>=4.0.0",
35
+ ]
36
+
37
+ [project.scripts]
38
+ # Server entry point - enables running via: uv run --project . server
39
+ # or: python -m __ENV_NAME__.server.app
40
+ server = "__ENV_NAME__.server.app:main"
41
+
42
+ [tool.setuptools]
43
+ include-package-data = true
44
+ packages = ["__ENV_NAME__", "__ENV_NAME__.server"]
45
+ package-dir = { "__ENV_NAME__" = ".", "__ENV_NAME__.server" = "server" }
.claude/skills/generate-openenv-env/assets/openenv_env_template/server/Dockerfile ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
#   - In-repo environments (with local OpenEnv sources)
#   - Standalone environments (with openenv from PyPI/Git)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Ensure git is available (required for installing dependencies from VCS)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=__ENV_NAME__

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
# NOTE(review): this install path uses curl — assumes curl exists in the
# builder image; confirm when overriding BASE_IMAGE.
RUN if ! command -v uv >/dev/null 2>&1; then \
    curl -LsSf https://astral.sh/uv/install.sh | sh && \
    mv /root/.local/bin/uv /usr/local/bin/uv && \
    mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
# First pass: dependencies only (--no-install-project) so this layer is
# cached independently of environment-code changes.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
    uv sync --frozen --no-install-project --no-editable; \
    else \
    uv sync --no-install-project --no-editable; \
    fi

# Second pass: install the project itself into the venv.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
    uv sync --frozen --no-editable; \
    else \
    uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the virtual environment from builder
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# Health check
# NOTE(review): assumes curl is available in the runtime image — confirm
# for custom BASE_IMAGE values.
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the FastAPI server
# The module path is constructed to work with the /app/env structure
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
.claude/skills/generate-openenv-env/assets/openenv_env_template/server/__ENV_NAME___environment.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ __ENV_TITLE_NAME__ Environment Implementation.
9
+
10
+ A simple test environment that echoes back messages sent to it.
11
+ Perfect for testing HTTP server infrastructure.
12
+ """
13
+
14
+ from uuid import uuid4
15
+
16
+ from openenv.core.env_server.interfaces import Environment
17
+ from openenv.core.env_server.types import State
18
+
19
+ try:
20
+ from ..models import __ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Observation
21
+ except ImportError:
22
+ from models import __ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Observation
23
+
24
+
25
class __ENV_CLASS_NAME__Environment(Environment):
    """Echo environment: every step returns the message it was sent.

    Intended as a minimal end-to-end test of the HTTP server
    infrastructure. State is limited to an episode id and a step counter.

    Example:
        >>> env = __ENV_CLASS_NAME__Environment()
        >>> obs = env.reset()
        >>> print(obs.echoed_message)  # "__ENV_TITLE_NAME__ environment ready!"
        >>>
        >>> obs = env.step(__ENV_CLASS_NAME__Action(message="Hello"))
        >>> print(obs.echoed_message)  # "Hello"
        >>> print(obs.message_length)  # 5
    """

    # Flip to True only when the environment isolates state per instance
    # and server/app.py sets max_concurrent_envs > 1.
    SUPPORTS_CONCURRENT_SESSIONS: bool = False

    def __init__(self):
        """Create a fresh environment with a brand-new episode id."""
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._reset_count = 0

    def reset(
        self, seed=None, episode_id=None, **kwargs
    ) -> __ENV_CLASS_NAME__Observation:
        """Start a new episode and return the initial observation.

        Args:
            seed: Optional seed for deterministic resets (not used by the
                echo environment).
            episode_id: Externally supplied episode id; a fresh UUID is
                generated when omitted.
            **kwargs: Extra reset arguments, accepted for interface
                compatibility.

        Returns:
            __ENV_CLASS_NAME__Observation announcing the environment is ready.
        """
        fresh_id = episode_id or str(uuid4())
        self._state = State(episode_id=fresh_id, step_count=0)
        self._reset_count += 1

        return __ENV_CLASS_NAME__Observation(
            echoed_message="__ENV_TITLE_NAME__ environment ready!",
            message_length=0,
            done=False,
            reward=0.0,
        )

    def step(self, action: __ENV_CLASS_NAME__Action) -> __ENV_CLASS_NAME__Observation:  # type: ignore[override]
        """Echo the action's message back to the caller.

        Args:
            action: __ENV_CLASS_NAME__Action carrying the message to echo.

        Returns:
            __ENV_CLASS_NAME__Observation with the echoed text, its length,
            and a reward of 0.1 per character.
        """
        self._state.step_count += 1

        text = action.message
        # Longer messages earn proportionally higher rewards.
        return __ENV_CLASS_NAME__Observation(
            echoed_message=text,
            message_length=len(text),
            done=False,
            reward=len(text) * 0.1,
            metadata={"original_message": text, "step": self._state.step_count},
        )

    @property
    def state(self) -> State:
        """Current State (episode_id plus step_count)."""
        return self._state
.claude/skills/generate-openenv-env/assets/openenv_env_template/server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """__ENV_TITLE_NAME__ environment server components."""
8
+
9
+ from .__ENV_NAME___environment import __ENV_CLASS_NAME__Environment
10
+
11
# Public server-side export: the concrete Environment implementation.
__all__ = ["__ENV_CLASS_NAME__Environment"]
.claude/skills/generate-openenv-env/assets/openenv_env_template/server/app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ FastAPI application for the __ENV_TITLE_NAME__ Environment.
9
+
10
+ This module creates an HTTP server that exposes the __ENV_CLASS_NAME__Environment
11
+ over HTTP and WebSocket endpoints, compatible with EnvClient.
12
+
13
+ Endpoints:
14
+ - POST /reset: Reset the environment
15
+ - POST /step: Execute an action
16
+ - GET /state: Get current environment state
17
+ - GET /schema: Get action/observation schemas
18
+ - WS /ws: WebSocket endpoint for persistent sessions
19
+
20
+ Usage:
21
+ # Development (with auto-reload):
22
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
+
24
+ # Production:
25
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
+
27
+ # Or run directly:
28
+ python -m server.app
29
+ """
30
+
31
+ try:
32
+ from openenv.core.env_server.http_server import create_app
33
+ except ImportError as e: # pragma: no cover
34
+ raise ImportError(
35
+ "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
36
+ ) from e
37
+
38
+ try:
39
+ from ..models import __ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Observation
40
+ from .__ENV_NAME___environment import __ENV_CLASS_NAME__Environment
41
+ except ImportError:
42
+ from models import __ENV_CLASS_NAME__Action, __ENV_CLASS_NAME__Observation
43
+ from server.__ENV_NAME___environment import __ENV_CLASS_NAME__Environment
44
+
45
+
46
+ # Create the app with web interface and README integration
47
+ app = create_app(
48
+ __ENV_CLASS_NAME__Environment,
49
+ __ENV_CLASS_NAME__Action,
50
+ __ENV_CLASS_NAME__Observation,
51
+ env_name="__ENV_NAME__",
52
+ max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
53
+ )
54
+
55
+
56
def main(host: str = "0.0.0.0", port: int = 8000):
    """
    Entry point for direct execution via uv run or python -m.

    This function enables running the server without Docker:
        uv run --project . server
        uv run --project . server --port 8001
        python -m __ENV_NAME__.server.app

    Args:
        host: Host address to bind to (default: "0.0.0.0")
        port: Port number to listen on (default: 8000)

    For production deployments, consider using uvicorn directly with
    multiple workers:
        uvicorn __ENV_NAME__.server.app:app --workers 4
    """
    # Local import: uvicorn is only required when the server is actually run.
    import uvicorn

    uvicorn.run(app, host=host, port=port)
76
+
77
+
78
if __name__ == "__main__":
    import argparse

    # Minimal CLI for direct execution. main() already accepts a host
    # parameter, so expose it here too (previously only --port was wired up).
    parser = argparse.ArgumentParser(
        description="Run the __ENV_TITLE_NAME__ environment server."
    )
    parser.add_argument(
        "--host", type=str, default="0.0.0.0", help="Host address to bind to"
    )
    parser.add_argument("--port", type=int, default=8000, help="Port to listen on")
    args = parser.parse_args()
    main(host=args.host, port=args.port)
.claude/skills/generate-openenv-env/assets/openenv_env_template/server/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.24.0