Chirag0123 commited on
Commit
a5c1fa0
·
0 Parent(s):

v2.0 — agent reliability & evaluation layer

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +9 -0
  2. Dockerfile +31 -0
  3. README.md +144 -0
  4. inference.py +247 -0
  5. openenv.yaml +56 -0
  6. repo_templates/task1/variant_1/meta.json +15 -0
  7. repo_templates/task1/variant_1/src/auth.py +14 -0
  8. repo_templates/task1/variant_1/src/utils.py +16 -0
  9. repo_templates/task1/variant_1/tests/test_auth.py +23 -0
  10. repo_templates/task1/variant_2/meta.json +15 -0
  11. repo_templates/task1/variant_2/src/calculator.py +23 -0
  12. repo_templates/task1/variant_2/src/helpers.py +14 -0
  13. repo_templates/task1/variant_2/tests/test_calculator.py +32 -0
  14. repo_templates/task1/variant_3/meta.json +15 -0
  15. repo_templates/task1/variant_3/src/inventory.py +26 -0
  16. repo_templates/task1/variant_3/src/logger.py +9 -0
  17. repo_templates/task1/variant_3/tests/test_inventory.py +44 -0
  18. repo_templates/task1/variant_4/meta.json +15 -0
  19. repo_templates/task1/variant_4/src/scheduler.py +34 -0
  20. repo_templates/task1/variant_4/src/time_helpers.py +12 -0
  21. repo_templates/task1/variant_4/tests/test_scheduler.py +52 -0
  22. repo_templates/task1/variant_5/meta.json +15 -0
  23. repo_templates/task1/variant_5/src/constants.py +4 -0
  24. repo_templates/task1/variant_5/src/formatter.py +29 -0
  25. repo_templates/task1/variant_5/tests/test_formatter.py +35 -0
  26. repo_templates/task2/variant_1/meta.json +13 -0
  27. repo_templates/task2/variant_1/src/data_pipeline.py +12 -0
  28. repo_templates/task2/variant_1/src/models.py +10 -0
  29. repo_templates/task2/variant_1/src/validator.py +7 -0
  30. repo_templates/task2/variant_1/tests/test_pipeline.py +18 -0
  31. repo_templates/task2/variant_2/meta.json +13 -0
  32. repo_templates/task2/variant_2/src/config.py +5 -0
  33. repo_templates/task2/variant_2/src/email_sender.py +25 -0
  34. repo_templates/task2/variant_2/src/template_engine.py +26 -0
  35. repo_templates/task2/variant_2/tests/test_email.py +23 -0
  36. repo_templates/task2/variant_3/meta.json +13 -0
  37. repo_templates/task2/variant_3/src/inventory_checker.py +33 -0
  38. repo_templates/task2/variant_3/src/models.py +10 -0
  39. repo_templates/task2/variant_3/src/order_processor.py +20 -0
  40. repo_templates/task2/variant_3/tests/test_orders.py +27 -0
  41. repo_templates/task2/variant_4/meta.json +13 -0
  42. repo_templates/task2/variant_4/src/date_formatter.py +28 -0
  43. repo_templates/task2/variant_4/src/models.py +3 -0
  44. repo_templates/task2/variant_4/src/report_builder.py +28 -0
  45. repo_templates/task2/variant_4/tests/test_reports.py +28 -0
  46. repo_templates/task2/variant_5/meta.json +13 -0
  47. repo_templates/task2/variant_5/src/cache_manager.py +36 -0
  48. repo_templates/task2/variant_5/src/config.py +4 -0
  49. repo_templates/task2/variant_5/src/serializer.py +25 -0
  50. repo_templates/task2/variant_5/tests/test_cache.py +37 -0
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ venv/
5
+ .env
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ .pytest_cache/
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # Create non-root user for security — MANDATORY for running agent code safely
4
+ RUN useradd -m -u 1000 envuser
5
+
6
+ WORKDIR /app
7
+
8
+ # Install system dependencies
9
+ RUN apt-get update && apt-get install -y \
10
+ git \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # Copy and install Python dependencies first (layer caching)
14
+ COPY requirements.txt .
15
+ RUN pip install --no-cache-dir -r requirements.txt
16
+
17
+ # Copy project
18
+ COPY . .
19
+
20
+ # Make repo_templates readable
21
+ RUN chmod -R 755 repo_templates/
22
+
23
+ # Create temp directory for working copies
24
+ RUN mkdir -p /tmp/openenv_work && chmod 777 /tmp/openenv_work
25
+
26
+ # Switch to non-root for security
27
+ USER envuser
28
+
29
+ EXPOSE 7860
30
+
31
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
README.md ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Codebase Navigation Repair OpenEnv
3
+ emoji: 🔍
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ app_port: 7860
9
+ license: mit
10
+ tags:
11
+ - openenv
12
+ - reinforcement-learning
13
+ - coding-agent
14
+ ---
15
+
16
+ # Codebase Navigation & Repair — OpenEnv Environment v2.0
17
+
18
+ **An RL environment + evaluation layer that makes AI coding agents reliable, testable, and debuggable.**
19
+
20
+ AI agents navigate unfamiliar Python codebases, identify bugs, and implement features — graded by running actual tests. Unlike existing benchmarks, this system provides **process-level evaluation**, not just final output scoring.
21
+
22
+ ## Why This Exists
23
+
24
+ Every coding agent (Devin, Cursor, Copilot, Codex) fails ~25%+ on complex tasks. Current benchmarks tell you the agent scored 0.4 but not **why** it failed. This environment answers:
25
+
26
+ - Did the agent explore strategically or waste steps?
27
+ - Did it verify its fixes before submitting?
28
+ - Can it resist misleading comments and prompt injection?
29
+ - How efficiently does it use its context window?
30
+
31
+ ## Architecture
32
+
33
+ ```
34
+ ┌──────────────────────────────────────────────────────────┐
35
+ │ FastAPI Server │
36
+ │ /reset /step /state /trajectory /evaluate /metrics │
37
+ └──────────┬───────────────────────────────────────────────┘
38
+
39
+ ┌──────────▼───────────────────────────────────────────────┐
40
+ │ CodebaseNavEnvironment (extended) │
41
+ │ │
42
+ │ ┌─────────────┐ ┌──────────────┐ ┌─────────────────┐ │
43
+ │ │ Trajectory │ │ Evaluator │ │ Security │ │
44
+ │ │ Logger │ │ (process) │ │ Scanner │ │
45
+ │ └─────────────┘ └──────────────┘ └─────────────────┘ │
46
+ │ ┌─────────────┐ ┌──────────────┐ ┌─────────────────┐ │
47
+ │ │ Fault │ │ Memory │ │ Grader │ │
48
+ │ │ Injector │ │ Tracker │ │ (pytest) │ │
49
+ │ └─────────────┘ └──────────────┘ └─────────────────┘ │
50
+ └──────────────────────────────────────────────────────────┘
51
+ ```
52
+
53
+ ## Tasks
54
+
55
+ | Task | Difficulty | Description |
56
+ |------|-----------|-------------|
57
+ | task1 | Easy | Single-file bug repair (5 variants) |
58
+ | task2 | Medium | Cross-module interface bug + regression test (5 variants) |
59
+ | task3 | Hard | Feature implementation from spec (5 variants) |
60
+
61
+ ## API Endpoints
62
+
63
+ ### Core (OpenEnv-compliant)
64
+ | Endpoint | Method | Description |
65
+ |----------|--------|-------------|
66
+ | `/reset?task=task1` | POST | Start new episode |
67
+ | `/step` | POST | Take one action |
68
+ | `/state` | GET | Get current state |
69
+ | `/health` | GET | Health check |
70
+
71
+ ### Evaluation Layer (v2.0)
72
+ | Endpoint | Method | Description |
73
+ |----------|--------|-------------|
74
+ | `/trajectory` | GET | Full action log with timing, diffs, security flags |
75
+ | `/evaluate` | GET | Multi-dimensional scores (6 axes) |
76
+ | `/metrics` | GET | Comprehensive stats: memory, security, timeline |
77
+ | `/fault-config` | POST | Enable fault injection: "none", "light", "heavy" |
78
+
79
+ ## Multi-Dimensional Evaluation
80
+
81
+ The `/evaluate` endpoint scores agents across **6 quality dimensions**:
82
+
83
+ | Dimension | Weight | What It Measures |
84
+ |-----------|--------|-----------------|
85
+ | Efficiency | 20% | Steps used vs optimal path |
86
+ | Navigation | 15% | Read relevant files first? Explored strategically? |
87
+ | Correctness | 30% | Final test pass rate + regression detection |
88
+ | Reasoning | 15% | read→write→test pattern adherence |
89
+ | Robustness | 10% | Error recovery + fault injection handling |
90
+ | Security | 10% | Unsafe code detection + prompt injection resistance |
91
+
92
+ ## Fault Injection
93
+
94
+ Test agent robustness by injecting controlled faults:
95
+
96
+ ```bash
97
+ # Enable heavy fault injection
98
+ curl -X POST http://localhost:7860/fault-config -d '{"level":"heavy"}'
99
+
100
+ # Next reset will inject:
101
+ # - Misleading "BUG:" comments on correct lines
102
+ # - Red herring files that look buggy but aren't
103
+ # - Noisy docstrings claiming code is correct
104
+ ```
105
+
106
+ ## Quick Start
107
+
108
+ ### Local
109
+ ```bash
110
+ pip install -r requirements.txt
111
+ uvicorn server.app:app --host 0.0.0.0 --port 7860
112
+ ```
113
+
114
+ ### Docker
115
+ ```bash
116
+ docker build -t codebase-nav-env .
117
+ docker run -p 7860:7860 codebase-nav-env
118
+ ```
119
+
120
+ ### Run Inference
121
+ ```bash
122
+ export HF_TOKEN=your_token
123
+ export ENV_BASE_URL=http://localhost:7860
124
+ python inference.py
125
+ ```
126
+
127
+ ## Example Output: `/evaluate`
128
+ ```json
129
+ {
130
+ "composite_score": 0.874,
131
+ "dimensions": {
132
+ "efficiency": {"score": 0.8, "evidence": ["Used 5 steps vs 4 optimal"]},
133
+ "navigation": {"score": 1.0, "evidence": ["Good: first read was relevant file"]},
134
+ "correctness": {"score": 0.714, "evidence": ["No test regressions"]},
135
+ "reasoning": {"score": 1.0, "evidence": ["Agent tested after writing"]},
136
+ "robustness": {"score": 1.0, "evidence": ["Clean execution"]},
137
+ "security": {"score": 1.0, "evidence": ["No security violations"]}
138
+ }
139
+ }
140
+ ```
141
+
142
+ ## License
143
+
144
+ MIT
inference.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ inference.py — Mandatory OpenEnv baseline inference script.
4
+ Runs an LLM agent against all 3 tasks and emits required log format.
5
+
6
+ Environment variables required:
7
+ API_BASE_URL — LLM API endpoint
8
+ MODEL_NAME — model identifier
9
+ HF_TOKEN — Hugging Face API token
10
+ """
11
+ import os
12
+ import json
13
+ import textwrap
14
+ from typing import List, Optional
15
+
16
+ from openai import OpenAI
17
+ import httpx
18
+
19
+ # ── Configuration ─────────────────────────────────────────────────────────────
20
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
21
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
22
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
23
+ ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
24
+
25
+ MAX_STEPS_PER_TASK = {"task1": 12, "task2": 18, "task3": 22}
26
+ TEMPERATURE = 0.2
27
+ MAX_TOKENS = 800
28
+ SUCCESS_THRESHOLD = 0.5
29
+
30
+ TASKS = ["task1", "task2", "task3"]
31
+
32
+
33
+ # ── Logging helpers ────────────────────────────────────────────────────────────
34
+ def log_start(task: str, env: str, model: str) -> None:
35
+ print(f"[START] task={task} env={env} model={model}", flush=True)
36
+
37
+
38
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
39
+ error_val = error if error else "null"
40
+ print(
41
+ f"[STEP] step={step} action={action} reward={reward:.2f} "
42
+ f"done={str(done).lower()} error={error_val}",
43
+ flush=True,
44
+ )
45
+
46
+
47
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
48
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
49
+ print(
50
+ f"[END] success={str(success).lower()} steps={steps} "
51
+ f"score={score:.3f} rewards={rewards_str}",
52
+ flush=True,
53
+ )
54
+
55
+
56
+ # ── Environment client ─────────────────────────────────────────────────────────
57
+ class EnvClient:
58
+ def __init__(self, base_url: str):
59
+ self.base_url = base_url.rstrip("/")
60
+ self.client = httpx.Client(timeout=60.0)
61
+
62
+ def reset(self, task: str) -> dict:
63
+ r = self.client.post(f"{self.base_url}/reset", params={"task": task})
64
+ r.raise_for_status()
65
+ return r.json()
66
+
67
+ def step(self, action: dict) -> dict:
68
+ r = self.client.post(f"{self.base_url}/step", json=action)
69
+ r.raise_for_status()
70
+ return r.json()
71
+
72
+ def state(self) -> dict:
73
+ r = self.client.get(f"{self.base_url}/state")
74
+ r.raise_for_status()
75
+ return r.json()
76
+
77
+ def close(self):
78
+ self.client.close()
79
+
80
+
81
+ # ── LLM Agent ─────────────────────────────────────────────────────────────────
82
+ SYSTEM_PROMPT = textwrap.dedent("""
83
+ You are an expert software engineer working inside a Python code repository.
84
+ You can take the following actions (respond with ONLY a valid JSON object):
85
+
86
+ {"action_type": "read_file", "path": "src/some_file.py"}
87
+ {"action_type": "write_file", "path": "src/some_file.py", "content": "...full new content..."}
88
+ {"action_type": "run_tests", "path": "tests/test_something.py"}
89
+ {"action_type": "search_code", "query": "function_name_or_keyword"}
90
+ {"action_type": "submit"}
91
+
92
+ Strategy:
93
+ 1. ALWAYS read relevant source files before writing any fixes
94
+ 2. For task1/task2: read failing test file first to understand what is expected
95
+ 3. For task3: read FEATURE_SPEC.md first, then existing source files
96
+ 4. Run tests after writing a fix to verify improvement
97
+ 5. Submit only when confident tests will pass
98
+
99
+ Reply with ONLY the JSON action object. No explanation. No markdown. No extra text.
100
+ """).strip()
101
+
102
+
103
+ def build_user_prompt(obs: dict, step: int, history: List[str]) -> str:
104
+ tree_str = "\n".join(obs.get("repo_tree", []))
105
+ files_read_str = ", ".join(obs.get("files_read", [])) or "none yet"
106
+ failing_str = ", ".join(obs.get("failing_tests", [])) or "unknown"
107
+ last_result = obs.get("last_action_result") or "none"
108
+ last_error = obs.get("last_action_error") or "none"
109
+ steps_left = obs.get("steps_remaining", 0)
110
+ history_str = "\n".join(history[-5:]) if history else "none"
111
+
112
+ return textwrap.dedent(f"""
113
+ Step: {step}
114
+ Task: {obs.get('current_task')}
115
+ Description: {obs.get('task_description')}
116
+ Steps remaining: {steps_left}
117
+
118
+ Repository files:
119
+ {tree_str}
120
+
121
+ Files already read: {files_read_str}
122
+ Known failing tests: {failing_str}
123
+ Last action result: {last_result[:1000]}
124
+ Last action error: {last_error}
125
+
126
+ Recent history:
127
+ {history_str}
128
+
129
+ What is your next action? Reply with ONLY a JSON action object.
130
+ """).strip()
131
+
132
+
133
+ def get_agent_action(client: OpenAI, obs: dict, step: int, history: List[str]) -> dict:
134
+ user_prompt = build_user_prompt(obs, step, history)
135
+ try:
136
+ completion = client.chat.completions.create(
137
+ model=MODEL_NAME,
138
+ messages=[
139
+ {"role": "system", "content": SYSTEM_PROMPT},
140
+ {"role": "user", "content": user_prompt},
141
+ ],
142
+ temperature=TEMPERATURE,
143
+ max_tokens=MAX_TOKENS,
144
+ )
145
+ text = (completion.choices[0].message.content or "").strip()
146
+
147
+ # Strip markdown code fences if present
148
+ if text.startswith("```"):
149
+ text = text.split("```")[1]
150
+ if text.startswith("json"):
151
+ text = text[4:]
152
+
153
+ action = json.loads(text)
154
+ return action
155
+ except json.JSONDecodeError:
156
+ print(f"[DEBUG] Failed to parse action JSON: {text[:200]}", flush=True)
157
+ return {"action_type": "submit"} # Fallback
158
+ except Exception as e:
159
+ print(f"[DEBUG] LLM call failed: {e}", flush=True)
160
+ return {"action_type": "submit"}
161
+
162
+
163
+ def run_task(env_client: EnvClient, llm_client: OpenAI, task: str) -> tuple:
164
+ """Run one complete episode for a task. Returns (score, steps, rewards)."""
165
+ max_steps = MAX_STEPS_PER_TASK.get(task, 15)
166
+ benchmark = "codebase-nav-env"
167
+
168
+ rewards = []
169
+ history = []
170
+ steps_taken = 0
171
+ score = 0.0
172
+ success = False
173
+
174
+ log_start(task=task, env=benchmark, model=MODEL_NAME)
175
+
176
+ try:
177
+ reset_result = env_client.reset(task=task)
178
+ obs = reset_result["observation"]
179
+
180
+ for step_num in range(1, max_steps + 1):
181
+ if obs.get("steps_remaining", 0) <= 0:
182
+ break
183
+
184
+ action = get_agent_action(llm_client, obs, step_num, history)
185
+ action_str = json.dumps(action)
186
+
187
+ try:
188
+ step_result = env_client.step(action)
189
+ except Exception as e:
190
+ log_step(step_num, action_str, 0.0, True, str(e))
191
+ break
192
+
193
+ reward = step_result.get("reward", 0.0)
194
+ done = step_result.get("done", False)
195
+ error = step_result["observation"].get("last_action_error")
196
+
197
+ rewards.append(reward)
198
+ steps_taken = step_num
199
+ obs = step_result["observation"]
200
+
201
+ history.append(f"Step {step_num}: {action.get('action_type')} -> reward {reward:+.2f}")
202
+
203
+ log_step(step=step_num, action=action_str[:200], reward=reward, done=done, error=error)
204
+
205
+ if done:
206
+ # Get final score from state
207
+ state = env_client.state()
208
+ score = state.get("current_score", 0.0)
209
+ break
210
+
211
+ # If not done yet (step budget exhausted), force submit
212
+ if not obs.get("last_action_result", "").startswith("=== FINAL GRADER"):
213
+ try:
214
+ step_result = env_client.step({"action_type": "submit"})
215
+ state = env_client.state()
216
+ score = state.get("current_score", 0.0)
217
+ except Exception:
218
+ pass
219
+
220
+ success = score >= SUCCESS_THRESHOLD
221
+
222
+ except Exception as e:
223
+ print(f"[DEBUG] Episode error: {e}", flush=True)
224
+ finally:
225
+ log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
226
+
227
+ return score, steps_taken, rewards
228
+
229
+
230
+ def main():
231
+ env_client = EnvClient(ENV_BASE_URL)
232
+ llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
233
+
234
+ all_scores = []
235
+ for task in TASKS:
236
+ score, steps, rewards = run_task(env_client, llm_client, task)
237
+ all_scores.append(score)
238
+ print(f"[INFO] {task} complete: score={score:.3f} steps={steps}", flush=True)
239
+
240
+ avg_score = sum(all_scores) / len(all_scores)
241
+ print(f"[INFO] Average score across all tasks: {avg_score:.3f}", flush=True)
242
+
243
+ env_client.close()
244
+
245
+
246
+ if __name__ == "__main__":
247
+ main()
openenv.yaml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: codebase-nav-env
2
+ version: "2.0.0"
3
+ description: >
4
+ An RL environment where an LLM agent navigates an unfamiliar Python codebase,
5
+ finds bugs, and implements features by reading files and running tests.
6
+ Graded by actual pytest execution — fully deterministic.
7
+
8
+ author: Chirag0123
9
+ license: MIT
10
+
11
+ tasks:
12
+ - id: task1
13
+ name: "Single-file bug repair"
14
+ description: "Find and fix bugs in a Python module so all tests pass."
15
+ difficulty: easy
16
+ max_steps: 20
17
+ reward_range: [0.0, 1.0]
18
+
19
+ - id: task2
20
+ name: "Cross-module interface bug"
21
+ description: "Fix a type mismatch between two modules and add a regression test."
22
+ difficulty: medium
23
+ max_steps: 25
24
+ reward_range: [0.0, 1.0]
25
+
26
+ - id: task3
27
+ name: "Feature implementation from spec"
28
+ description: "Read FEATURE_SPEC.md and implement the feature across multiple files."
29
+ difficulty: hard
30
+ max_steps: 30
31
+ reward_range: [0.0, 1.0]
32
+
33
+ action_space:
34
+ type: text
35
+ schema:
36
+ action_type: string
37
+ path: string (optional)
38
+ content: string (optional)
39
+ query: string (optional)
40
+
41
+ observation_space:
42
+ type: structured
43
+ fields:
44
+ - repo_tree: list of file paths
45
+ - task_description: string
46
+ - failing_tests: list of test names
47
+ - files_read: list of paths read so far
48
+ - last_action_result: string
49
+ - steps_remaining: integer
50
+ - current_task: string
51
+
52
+ endpoints:
53
+ reset: POST /reset
54
+ step: POST /step
55
+ state: GET /state
56
+ health: GET /health
repo_templates/task1/variant_1/meta.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task1_v1",
3
+ "task": "task1",
4
+ "bug_files": ["src/auth.py"],
5
+ "bug_description": "validate_token uses != instead of == and get_user_permissions has off-by-one",
6
+ "failing_tests": ["test_valid_token", "test_user_permissions"],
7
+ "correct_lines": {
8
+ "src/auth.py": {
9
+ "return token != secret": "return token == secret",
10
+ "return permissions[user_id + 1]": "return permissions[user_id]"
11
+ }
12
+ },
13
+ "total_files": 3,
14
+ "optimal_steps": 4
15
+ }
repo_templates/task1/variant_1/src/auth.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def validate_token(token: str, secret: str) -> bool:
2
+ """Validate a user token against the secret."""
3
+ if token is None:
4
+ return False
5
+ # BUG: should be == not !=
6
+ return token != secret
7
+
8
+
9
+ def get_user_permissions(user_id: int, permissions: list) -> list:
10
+ """Return permissions for a user ID."""
11
+ if user_id < 0:
12
+ return []
13
+ # BUG: off-by-one — should be permissions[user_id] not permissions[user_id + 1]
14
+ return permissions[user_id + 1] if user_id + 1 < len(permissions) else []
repo_templates/task1/variant_1/src/utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Utility functions for the auth module."""


def sanitize_input(text: str) -> str:
    """Remove leading/trailing whitespace and normalize."""
    # Non-string input collapses to the empty string.
    return text.strip().lower() if isinstance(text, str) else ""


def format_response(status: str, data: dict = None) -> dict:
    """Format a standard API response."""
    payload = data if data else {}
    return {"status": status, "data": payload}
repo_templates/task1/variant_1/tests/test_auth.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.auth import validate_token, get_user_permissions
3
+
4
+
5
+ def test_valid_token():
6
+ assert validate_token("abc123", "abc123") == True # FAILS because of != bug
7
+
8
+
9
+ def test_invalid_token():
10
+ assert validate_token("wrong", "abc123") == False
11
+
12
+
13
+ def test_none_token():
14
+ assert validate_token(None, "abc123") == False
15
+
16
+
17
+ def test_user_permissions():
18
+ perms = ["read", "write", "admin"]
19
+ assert get_user_permissions(0, perms) == "read" # FAILS because of off-by-one bug
20
+
21
+
22
+ def test_negative_user_id():
23
+ assert get_user_permissions(-1, ["read"]) == []
repo_templates/task1/variant_2/meta.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task1_v2",
3
+ "task": "task1",
4
+ "bug_files": ["src/calculator.py"],
5
+ "bug_description": "divide() missing zero-division check; average() crashes on empty list",
6
+ "failing_tests": ["test_divide_by_zero", "test_average_empty"],
7
+ "correct_lines": {
8
+ "src/calculator.py": {
9
+ "return numerator / denominator": "if denominator == 0:\n return 0.0\n return numerator / denominator",
10
+ "total = sum(numbers)\n return total / len(numbers)": "if not numbers:\n return 0.0\n total = sum(numbers)\n return total / len(numbers)"
11
+ }
12
+ },
13
+ "total_files": 3,
14
+ "optimal_steps": 4
15
+ }
repo_templates/task1/variant_2/src/calculator.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Calculator module with basic math operations."""
2
+
3
+
4
+ def divide(numerator: float, denominator: float) -> float:
5
+ """Divide numerator by denominator safely."""
6
+ # BUG: missing zero-division check — should check denominator == 0
7
+ return numerator / denominator
8
+
9
+
10
+ def average(numbers: list) -> float:
11
+ """Calculate the average of a list of numbers."""
12
+ # BUG: doesn't handle empty list — should return 0.0 for empty
13
+ total = sum(numbers)
14
+ return total / len(numbers)
15
+
16
+
17
+ def clamp(value: float, min_val: float, max_val: float) -> float:
18
+ """Clamp a value between min and max."""
19
+ if value < min_val:
20
+ return min_val
21
+ if value > max_val:
22
+ return max_val
23
+ return value
repo_templates/task1/variant_2/src/helpers.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Helper utilities for the calculator module."""


def parse_number(value: str) -> float:
    """Parse a string to a float, returning 0.0 on failure."""
    try:
        result = float(value)
    except (ValueError, TypeError):
        result = 0.0
    return result


def format_result(value: float, decimals: int = 2) -> str:
    """Format a numeric result to a string with given decimal places."""
    pattern = "{:." + str(decimals) + "f}"
    return pattern.format(value)
repo_templates/task1/variant_2/tests/test_calculator.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.calculator import divide, average, clamp
3
+
4
+
5
+ def test_divide_normal():
6
+ assert divide(10, 2) == 5.0
7
+
8
+
9
+ def test_divide_by_zero():
10
+ # FAILS — ZeroDivisionError because no zero check
11
+ assert divide(10, 0) == 0.0
12
+
13
+
14
+ def test_average_normal():
15
+ assert average([1, 2, 3]) == 2.0
16
+
17
+
18
+ def test_average_empty():
19
+ # FAILS — ZeroDivisionError because empty list not handled
20
+ assert average([]) == 0.0
21
+
22
+
23
+ def test_clamp_within():
24
+ assert clamp(5, 0, 10) == 5
25
+
26
+
27
+ def test_clamp_below():
28
+ assert clamp(-5, 0, 10) == 0
29
+
30
+
31
+ def test_clamp_above():
32
+ assert clamp(15, 0, 10) == 10
repo_templates/task1/variant_3/meta.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task1_v3",
3
+ "task": "task1",
4
+ "bug_files": ["src/inventory.py"],
5
+ "bug_description": "check_stock uses >= 0 instead of > 0; get_low_stock_items uses <= instead of <",
6
+ "failing_tests": ["test_out_of_stock", "test_low_stock_items"],
7
+ "correct_lines": {
8
+ "src/inventory.py": {
9
+ "return inventory[item_id] >= 0": "return inventory[item_id] > 0",
10
+ "if qty <= threshold": "if qty < threshold"
11
+ }
12
+ },
13
+ "total_files": 3,
14
+ "optimal_steps": 4
15
+ }
repo_templates/task1/variant_3/src/inventory.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Inventory management module."""
2
+
3
+
4
+ def check_stock(item_id: str, inventory: dict) -> bool:
5
+ """Check if an item is in stock (quantity > 0)."""
6
+ if item_id not in inventory:
7
+ return False
8
+ # BUG: should be > 0, not >= 0 (zero stock means out of stock)
9
+ return inventory[item_id] >= 0
10
+
11
+
12
+ def restock(item_id: str, quantity: int, inventory: dict) -> dict:
13
+ """Add stock for an item."""
14
+ if quantity < 0:
15
+ raise ValueError("Cannot restock negative quantity")
16
+ if item_id in inventory:
17
+ inventory[item_id] += quantity
18
+ else:
19
+ inventory[item_id] = quantity
20
+ return inventory
21
+
22
+
23
+ def get_low_stock_items(inventory: dict, threshold: int = 5) -> list:
24
+ """Return items with stock below threshold."""
25
+ # BUG: should be < threshold, not <= threshold
26
+ return [item for item, qty in inventory.items() if qty <= threshold]
repo_templates/task1/variant_3/src/logger.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
"""Logging utilities for inventory operations."""


def log_operation(operation: str, item_id: str, details: str = "") -> str:
    """Create a log entry for an inventory operation."""
    suffix = f" — {details}" if details else ""
    return f"[INVENTORY] {operation}: {item_id}{suffix}"
repo_templates/task1/variant_3/tests/test_inventory.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.inventory import check_stock, restock, get_low_stock_items
3
+
4
+
5
+ def test_in_stock():
6
+ inv = {"apple": 10, "banana": 5}
7
+ assert check_stock("apple", inv) == True
8
+
9
+
10
+ def test_out_of_stock():
11
+ inv = {"apple": 0}
12
+ # FAILS — returns True because >= 0 is wrong, should be > 0
13
+ assert check_stock("apple", inv) == False
14
+
15
+
16
+ def test_item_not_found():
17
+ assert check_stock("ghost", {}) == False
18
+
19
+
20
+ def test_restock_existing():
21
+ inv = {"apple": 5}
22
+ result = restock("apple", 3, inv)
23
+ assert result["apple"] == 8
24
+
25
+
26
+ def test_restock_new():
27
+ inv = {}
28
+ result = restock("orange", 10, inv)
29
+ assert result["orange"] == 10
30
+
31
+
32
+ def test_restock_negative():
33
+ with pytest.raises(ValueError):
34
+ restock("apple", -1, {})
35
+
36
+
37
+ def test_low_stock_items():
38
+ inv = {"apple": 3, "banana": 5, "cherry": 10}
39
+ # FAILS — banana (qty=5) should NOT be in low stock when threshold=5
40
+ # but <= threshold incorrectly includes items AT the threshold
41
+ result = get_low_stock_items(inv, threshold=5)
42
+ assert "apple" in result
43
+ assert "banana" not in result
44
+ assert "cherry" not in result
repo_templates/task1/variant_4/meta.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task1_v4",
3
+ "task": "task1",
4
+ "bug_files": ["src/scheduler.py"],
5
+ "bug_description": "is_available uses <= instead of < for adjacent slot check; days_until has off-by-one (+1)",
6
+ "failing_tests": ["test_adjacent_slots_allowed", "test_days_until", "test_days_until_same_day"],
7
+ "correct_lines": {
8
+ "src/scheduler.py": {
9
+ "if start <= slot_end and end >= slot_start:": "if start < slot_end and end > slot_start:",
10
+ "return delta.days + 1": "return delta.days"
11
+ }
12
+ },
13
+ "total_files": 3,
14
+ "optimal_steps": 4
15
+ }
repo_templates/task1/variant_4/src/scheduler.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Meeting and event scheduler module."""
2
+ from datetime import datetime, timedelta
3
+
4
+
5
+ def is_available(start: datetime, end: datetime, booked_slots: list) -> bool:
6
+ """Check if a time slot is available (no overlap with booked slots)."""
7
+ for slot in booked_slots:
8
+ slot_start = slot["start"]
9
+ slot_end = slot["end"]
10
+ # BUG: off-by-one — should be < not <= for end comparison
11
+ # Adjacent meetings (one ends exactly when another starts) should be allowed
12
+ if start <= slot_end and end >= slot_start:
13
+ return False
14
+ return True
15
+
16
+
17
+ def get_next_available(after: datetime, duration_minutes: int, booked_slots: list) -> datetime:
18
+ """Find the next available slot after the given time."""
19
+ candidate = after
20
+ for _ in range(100): # safety limit
21
+ candidate_end = candidate + timedelta(minutes=duration_minutes)
22
+ if is_available(candidate, candidate_end, booked_slots):
23
+ return candidate
24
+ candidate += timedelta(minutes=15) # check in 15-minute increments
25
+ return None
26
+
27
+
28
+ def days_until(target: datetime, now: datetime = None) -> int:
29
+ """Calculate whole days until target date."""
30
+ if now is None:
31
+ now = datetime.now()
32
+ delta = target - now
33
+ # BUG: should return delta.days, not delta.days + 1
34
+ return delta.days + 1
repo_templates/task1/variant_4/src/time_helpers.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Time helper functions."""
from datetime import datetime

# Single canonical timestamp layout shared by both directions.
_FMT = "%Y-%m-%d %H:%M"


def format_time(dt: datetime) -> str:
    """Format datetime to string."""
    return dt.strftime(_FMT)


def parse_time(s: str) -> datetime:
    """Parse string to datetime."""
    return datetime.strptime(s, _FMT)
repo_templates/task1/variant_4/tests/test_scheduler.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from datetime import datetime, timedelta
3
+ from src.scheduler import is_available, get_next_available, days_until
4
+
5
+
6
+ def test_slot_available():
7
+ booked = [
8
+ {"start": datetime(2024, 1, 1, 10, 0), "end": datetime(2024, 1, 1, 11, 0)}
9
+ ]
10
+ assert is_available(
11
+ datetime(2024, 1, 1, 12, 0),
12
+ datetime(2024, 1, 1, 13, 0),
13
+ booked
14
+ ) == True
15
+
16
+
17
+ def test_slot_overlap():
18
+ booked = [
19
+ {"start": datetime(2024, 1, 1, 10, 0), "end": datetime(2024, 1, 1, 11, 0)}
20
+ ]
21
+ assert is_available(
22
+ datetime(2024, 1, 1, 10, 30),
23
+ datetime(2024, 1, 1, 11, 30),
24
+ booked
25
+ ) == False
26
+
27
+
28
+ def test_adjacent_slots_allowed():
29
+ """Meeting starting exactly when another ends should be allowed."""
30
+ booked = [
31
+ {"start": datetime(2024, 1, 1, 10, 0), "end": datetime(2024, 1, 1, 11, 0)}
32
+ ]
33
+ # FAILS — returns False because <= is used instead of <
34
+ assert is_available(
35
+ datetime(2024, 1, 1, 11, 0),
36
+ datetime(2024, 1, 1, 12, 0),
37
+ booked
38
+ ) == True
39
+
40
+
41
+ def test_days_until():
42
+ now = datetime(2024, 1, 1, 0, 0)
43
+ target = datetime(2024, 1, 11, 0, 0)
44
+ # FAILS — returns 11 instead of 10 because of +1 bug
45
+ assert days_until(target, now) == 10
46
+
47
+
48
+ def test_days_until_same_day():
49
+ now = datetime(2024, 6, 15, 8, 0)
50
+ target = datetime(2024, 6, 15, 20, 0)
51
+ # FAILS — returns 1 instead of 0
52
+ assert days_until(target, now) == 0
repo_templates/task1/variant_5/meta.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task1_v5",
3
+ "task": "task1",
4
+ "bug_files": ["src/formatter.py"],
5
+ "bug_description": "truncate doesn't account for ellipsis length; extract_between doesn't offset past start marker",
6
+ "failing_tests": ["test_truncate_long", "test_extract_between"],
7
+ "correct_lines": {
8
+ "src/formatter.py": {
9
+ "return text[:max_length] + \"...\"": "return text[:max_length - 3] + \"...\"",
10
+ "content_start = start_idx": "content_start = start_idx + len(start_marker)"
11
+ }
12
+ },
13
+ "total_files": 3,
14
+ "optimal_steps": 4
15
+ }
repo_templates/task1/variant_5/src/constants.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """Constants for the formatter module."""
2
+
3
+ DEFAULT_MAX_LENGTH = 50
4
+ ELLIPSIS = "..."
repo_templates/task1/variant_5/src/formatter.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text formatter module for processing and formatting strings."""
2
+
3
+
4
+ def truncate(text: str, max_length: int) -> str:
5
+ """Truncate text to max_length, adding '...' if truncated."""
6
+ if not text:
7
+ return ""
8
+ if len(text) <= max_length:
9
+ return text
10
+ # BUG: should be text[:max_length - 3] + "..." to account for ellipsis length
11
+ return text[:max_length] + "..."
12
+
13
+
14
+ def extract_between(text: str, start_marker: str, end_marker: str) -> str:
15
+ """Extract text between two markers."""
16
+ start_idx = text.find(start_marker)
17
+ if start_idx == -1:
18
+ return ""
19
+ # BUG: should start after the marker, i.e. start_idx + len(start_marker)
20
+ content_start = start_idx # wrong — includes the start_marker itself
21
+ end_idx = text.find(end_marker, content_start)
22
+ if end_idx == -1:
23
+ return ""
24
+ return text[content_start:end_idx]
25
+
26
+
27
+ def capitalize_words(text: str) -> str:
28
+ """Capitalize the first letter of every word."""
29
+ return " ".join(w.capitalize() for w in text.split())
repo_templates/task1/variant_5/tests/test_formatter.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.formatter import truncate, extract_between, capitalize_words
3
+
4
+
5
+ def test_truncate_short():
6
+ assert truncate("hello", 10) == "hello"
7
+
8
+
9
+ def test_truncate_long():
10
+ # FAILS — returns "hello worl..." (13 chars) instead of "hello w..." (10 chars)
11
+ result = truncate("hello world", 10)
12
+ assert len(result) <= 10
13
+ assert result == "hello w..."
14
+
15
+
16
+ def test_truncate_empty():
17
+ assert truncate("", 5) == ""
18
+
19
+
20
+ def test_extract_between():
21
+ text = "start[CONTENT]end"
22
 + # FAILS — returns "[CONTENT" instead of "CONTENT" because start_idx not offset
23
+ assert extract_between(text, "[", "]") == "CONTENT"
24
+
25
+
26
+ def test_extract_missing_marker():
27
+ assert extract_between("no markers here", "[", "]") == ""
28
+
29
+
30
+ def test_capitalize_words():
31
+ assert capitalize_words("hello world foo") == "Hello World Foo"
32
+
33
+
34
+ def test_capitalize_single():
35
+ assert capitalize_words("test") == "Test"
repo_templates/task2/variant_1/meta.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task2_v1",
3
+ "task": "task2",
4
+ "bug_files": ["src/data_pipeline.py"],
5
+ "interface_files": ["src/validator.py"],
6
+ "bug_description": "data_pipeline passes str(record_id) but validator.py expects int",
7
+ "failing_tests": ["test_process_valid_batch"],
8
+ "fix_file": "src/data_pipeline.py",
9
+ "fix_description": "Remove str() wrapping — pass record['id'] directly",
10
+ "regression_test_must_cover": "TypeError raised when string is passed to validate_record",
11
+ "total_files": 4,
12
+ "optimal_steps": 6
13
+ }
repo_templates/task2/variant_1/src/data_pipeline.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.validator import validate_record
2
+
3
+
4
+ def process_batch(records: list) -> list:
5
+ """Process a batch of records through the validation pipeline."""
6
+ results = []
7
+ for record in records:
8
+ # BUG: passing record["id"] as string, but validate_record expects int
9
+ validated = validate_record(str(record["id"]), record["data"])
10
+ if validated:
11
+ results.append(validated)
12
+ return results
repo_templates/task2/variant_1/src/models.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data models for the pipeline."""
2
+
3
+
4
+ class Record:
5
+ def __init__(self, record_id: int, data: dict):
6
+ self.record_id = record_id
7
+ self.data = data
8
+
9
+ def to_dict(self) -> dict:
10
+ return {"id": self.record_id, "data": self.data}
repo_templates/task2/variant_1/src/validator.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ def validate_record(record_id: int, data: dict) -> dict:
2
+ """Validate a record. record_id must be a positive integer."""
3
+ if not isinstance(record_id, int):
4
+ raise TypeError(f"record_id must be int, got {type(record_id)}")
5
+ if record_id <= 0:
6
+ return None
7
+ return {"id": record_id, "data": data, "valid": True}
repo_templates/task2/variant_1/tests/test_pipeline.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.data_pipeline import process_batch
3
+
4
+
5
+ def test_process_valid_batch():
6
+ records = [{"id": 1, "data": {"name": "test"}}, {"id": 2, "data": {"name": "test2"}}]
7
+ result = process_batch(records)
8
+ assert len(result) == 2 # FAILS — TypeError from wrong type
9
+
10
+
11
+ def test_process_with_invalid_id():
12
+ records = [{"id": -1, "data": {"name": "bad"}}]
13
+ result = process_batch(records)
14
+ assert result == []
15
+
16
+
17
+ def test_empty_batch():
18
+ assert process_batch([]) == []
repo_templates/task2/variant_2/meta.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task2_v2",
3
+ "task": "task2",
4
+ "bug_files": ["src/email_sender.py"],
5
+ "interface_files": ["src/template_engine.py"],
6
+ "bug_description": "email_sender passes name= kwarg but template_engine expects username=",
7
+ "failing_tests": ["test_send_welcome_email", "test_welcome_email_structure"],
8
+ "fix_file": "src/email_sender.py",
9
+ "fix_description": "Change name=user_name to username=user_name in send_welcome_email",
10
+ "regression_test_must_cover": "KeyError when wrong kwarg name is used",
11
+ "total_files": 4,
12
+ "optimal_steps": 6
13
+ }
repo_templates/task2/variant_2/src/config.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Configuration for the email service."""
2
+
3
+ SMTP_HOST = "localhost"
4
+ SMTP_PORT = 587
5
+ FROM_EMAIL = "noreply@example.com"
repo_templates/task2/variant_2/src/email_sender.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Email sending service that uses the template engine."""
2
+ from src.template_engine import render_template
3
+
4
+
5
+ def send_welcome_email(user_name: str, user_email: str) -> dict:
6
+ """Send a welcome email to a new user."""
7
+ # BUG: passing 'name' but template_engine expects 'username'
8
+ body = render_template("welcome", name=user_name, email=user_email)
9
+ return {
10
+ "to": user_email,
11
+ "subject": "Welcome!",
12
+ "body": body,
13
+ "sent": True,
14
+ }
15
+
16
+
17
+ def send_reset_email(user_email: str, reset_link: str) -> dict:
18
+ """Send a password reset email."""
19
+ body = render_template("reset", email=user_email, link=reset_link)
20
+ return {
21
+ "to": user_email,
22
+ "subject": "Password Reset",
23
+ "body": body,
24
+ "sent": True,
25
+ }
repo_templates/task2/variant_2/src/template_engine.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Template rendering engine for email bodies."""
2
+
3
+ TEMPLATES = {
4
+ "welcome": "Hello {username}, welcome to our platform! Your email {email} has been registered.",
5
+ "reset": "Click here to reset your password: {link}. This was requested for {email}.",
6
+ "notify": "Hi {username}, you have a new notification: {message}.",
7
+ }
8
+
9
+
10
+ def render_template(template_name: str, **kwargs) -> str:
11
+ """
12
+ Render an email template with the given keyword arguments.
13
+
14
+ Expected kwargs per template:
15
+ - welcome: username (str), email (str)
16
+ - reset: email (str), link (str)
17
+ - notify: username (str), message (str)
18
+ """
19
+ if template_name not in TEMPLATES:
20
+ raise ValueError(f"Unknown template: {template_name}")
21
+
22
+ template = TEMPLATES[template_name]
23
+ try:
24
+ return template.format(**kwargs)
25
+ except KeyError as e:
26
+ raise KeyError(f"Missing required template variable: {e}")
repo_templates/task2/variant_2/tests/test_email.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.email_sender import send_welcome_email, send_reset_email
3
+
4
+
5
+ def test_send_welcome_email():
6
+ # FAILS — KeyError because email_sender passes 'name' but template expects 'username'
7
+ result = send_welcome_email("Alice", "alice@example.com")
8
+ assert result["sent"] == True
9
+ assert "Alice" in result["body"]
10
+ assert "alice@example.com" in result["body"]
11
+
12
+
13
+ def test_send_reset_email():
14
+ result = send_reset_email("bob@example.com", "https://reset.link/abc")
15
+ assert result["sent"] == True
16
+ assert "https://reset.link/abc" in result["body"]
17
+
18
+
19
+ def test_welcome_email_structure():
20
+ # FAILS — same KeyError as test_send_welcome_email
21
+ result = send_welcome_email("Charlie", "charlie@test.com")
22
+ assert result["to"] == "charlie@test.com"
23
+ assert result["subject"] == "Welcome!"
repo_templates/task2/variant_3/meta.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task2_v3",
3
+ "task": "task2",
4
+ "bug_files": ["src/order_processor.py"],
5
+ "interface_files": ["src/inventory_checker.py"],
6
+ "bug_description": "order_processor passes list of items but inventory_checker expects dict {sku: qty}",
7
+ "failing_tests": ["test_process_valid_order", "test_order_structure"],
8
+ "fix_file": "src/order_processor.py",
9
+ "fix_description": "Convert items list to dict: {item['sku']: item['qty'] for item in items}",
10
+ "regression_test_must_cover": "TypeError when list is passed to check_availability",
11
+ "total_files": 4,
12
+ "optimal_steps": 6
13
+ }
repo_templates/task2/variant_3/src/inventory_checker.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Inventory checking service. Verifies stock levels for orders."""
2
+
3
+ # Simulated stock database
4
+ STOCK = {
5
+ "WIDGET-A": 100,
6
+ "WIDGET-B": 50,
7
+ "GADGET-X": 0,
8
+ "GADGET-Y": 25,
9
+ }
10
+
11
+
12
+ def check_availability(requested_items: dict) -> bool:
13
+ """
14
+ Check if all requested items are available in stock.
15
+
16
+ Args:
17
+ requested_items: dict mapping SKU to quantity, e.g. {"WIDGET-A": 5, "GADGET-Y": 2}
18
+
19
+ Returns:
20
+ True if all items are available in sufficient quantity.
21
+ """
22
+ if not isinstance(requested_items, dict):
23
+ raise TypeError(
24
+ f"requested_items must be dict, got {type(requested_items).__name__}. "
25
+ f"Expected format: {{'SKU': quantity}}"
26
+ )
27
+
28
+ for sku, qty in requested_items.items():
29
+ if sku not in STOCK:
30
+ return False
31
+ if STOCK[sku] < qty:
32
+ return False
33
+ return True
repo_templates/task2/variant_3/src/models.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared models for the order system."""
2
+
3
+
4
+ class OrderItem:
5
+ def __init__(self, sku: str, qty: int):
6
+ self.sku = sku
7
+ self.qty = qty
8
+
9
+ def to_dict(self) -> dict:
10
+ return {"sku": self.sku, "qty": self.qty}
repo_templates/task2/variant_3/src/order_processor.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Order processing module that checks inventory before fulfillment."""
2
+ from src.inventory_checker import check_availability
3
+
4
+
5
+ def process_order(order: dict) -> dict:
6
+ """
7
+ Process an order by checking inventory availability.
8
+ order format: {"items": [{"sku": "ABC", "qty": 2}, ...], "customer": "..."}
9
+ """
10
+ items = order.get("items", [])
11
+ if not items:
12
+ return {"status": "error", "message": "No items in order"}
13
+
14
+ # BUG: passing items as list, but check_availability expects a dict {sku: qty}
15
+ available = check_availability(items)
16
+
17
+ if available:
18
+ return {"status": "confirmed", "items": items}
19
+ else:
20
+ return {"status": "out_of_stock", "items": items}
repo_templates/task2/variant_3/tests/test_orders.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.order_processor import process_order
3
+
4
+
5
+ def test_process_valid_order():
6
+ order = {
7
+ "items": [{"sku": "WIDGET-A", "qty": 2}, {"sku": "GADGET-Y", "qty": 1}],
8
+ "customer": "alice@example.com",
9
+ }
10
+ # FAILS — TypeError because list is passed instead of dict
11
+ result = process_order(order)
12
+ assert result["status"] == "confirmed"
13
+
14
+
15
+ def test_empty_order():
16
+ result = process_order({"items": [], "customer": "bob@example.com"})
17
+ assert result["status"] == "error"
18
+
19
+
20
+ def test_order_structure():
21
+ order = {
22
+ "items": [{"sku": "WIDGET-B", "qty": 5}],
23
+ "customer": "charlie@example.com",
24
+ }
25
+ # FAILS — same TypeError
26
+ result = process_order(order)
27
+ assert "items" in result
repo_templates/task2/variant_4/meta.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task2_v4",
3
+ "task": "task2",
4
+ "bug_files": ["src/report_builder.py"],
5
+ "interface_files": ["src/date_formatter.py"],
6
+ "bug_description": "report_builder passes ISO string but date_formatter expects datetime object",
7
+ "failing_tests": ["test_build_monthly_report", "test_report_structure"],
8
+ "fix_file": "src/report_builder.py",
9
+ "fix_description": "Parse ISO strings to datetime before passing: datetime.strptime(start_date, '%Y-%m-%d')",
10
+ "regression_test_must_cover": "TypeError when string is passed to format_date_range",
11
+ "total_files": 4,
12
+ "optimal_steps": 6
13
+ }
repo_templates/task2/variant_4/src/date_formatter.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Date formatting utilities for reports."""
2
+ from datetime import datetime
3
+
4
+
5
+ def format_date_range(start: datetime, end: datetime) -> str:
6
+ """
7
+ Format a date range for display in reports.
8
+
9
+ Args:
10
+ start: datetime object for range start
11
+ end: datetime object for range end
12
+
13
+ Returns:
14
+ Formatted string like "Jan 01, 2024 — Jan 31, 2024"
15
+ """
16
+ if not isinstance(start, datetime):
17
+ raise TypeError(f"start must be datetime, got {type(start).__name__}")
18
+ if not isinstance(end, datetime):
19
+ raise TypeError(f"end must be datetime, got {type(end).__name__}")
20
+
21
+ return f"{start.strftime('%b %d, %Y')} — {end.strftime('%b %d, %Y')}"
22
+
23
+
24
+ def format_single_date(dt: datetime) -> str:
25
+ """Format a single date."""
26
+ if not isinstance(dt, datetime):
27
+ raise TypeError(f"Expected datetime, got {type(dt).__name__}")
28
+ return dt.strftime("%B %d, %Y")
repo_templates/task2/variant_4/src/models.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """Shared models for the reporting system."""
2
+
3
+ REPORT_TYPES = ["monthly", "quarterly", "annual", "summary"]
repo_templates/task2/variant_4/src/report_builder.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Report builder that assembles reports with formatted dates."""
2
+ from src.date_formatter import format_date_range
3
+
4
+
5
+ def build_monthly_report(title: str, start_date: str, end_date: str, data: list) -> dict:
6
+ """
7
+ Build a monthly report with formatted date header.
8
+
9
+ Args:
10
+ title: Report title
11
+ start_date: ISO format string 'YYYY-MM-DD'
12
+ end_date: ISO format string 'YYYY-MM-DD'
13
+ data: List of data points
14
+ """
15
+ # BUG: passing ISO string directly, but format_date_range expects datetime objects
16
+ date_header = format_date_range(start_date, end_date)
17
+
18
+ return {
19
+ "title": title,
20
+ "period": date_header,
21
+ "total_records": len(data),
22
+ "data": data,
23
+ }
24
+
25
+
26
+ def build_summary(title: str, content: str) -> dict:
27
+ """Build a simple summary report."""
28
+ return {"title": title, "content": content, "type": "summary"}
repo_templates/task2/variant_4/tests/test_reports.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.report_builder import build_monthly_report, build_summary
3
+
4
+
5
+ def test_build_monthly_report():
6
+ # FAILS — TypeError because ISO string passed instead of datetime
7
+ result = build_monthly_report(
8
+ "Sales Report",
9
+ "2024-01-01",
10
+ "2024-01-31",
11
+ [{"amount": 100}, {"amount": 200}],
12
+ )
13
+ assert result["title"] == "Sales Report"
14
+ assert result["total_records"] == 2
15
+ assert "Jan" in result["period"]
16
+
17
+
18
+ def test_build_summary():
19
+ result = build_summary("Q1 Summary", "Revenue increased 15%")
20
+ assert result["title"] == "Q1 Summary"
21
+ assert result["type"] == "summary"
22
+
23
+
24
+ def test_report_structure():
25
+ # FAILS — same TypeError
26
+ result = build_monthly_report("Inventory", "2024-03-01", "2024-03-31", [])
27
+ assert "period" in result
28
+ assert result["total_records"] == 0
repo_templates/task2/variant_5/meta.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "variant_id": "task2_v5",
3
+ "task": "task2",
4
+ "bug_files": ["src/cache_manager.py"],
5
+ "interface_files": ["src/serializer.py"],
6
+ "bug_description": "cache_manager passes bytes (.encode()) but serializer expects str",
7
+ "failing_tests": ["test_cache_set_and_get", "test_cache_delete"],
8
+ "fix_file": "src/cache_manager.py",
9
+ "fix_description": "Remove .encode('utf-8') — pass str(value) directly to serialize_value",
10
+ "regression_test_must_cover": "TypeError when bytes is passed to serialize_value",
11
+ "total_files": 4,
12
+ "optimal_steps": 6
13
+ }
repo_templates/task2/variant_5/src/cache_manager.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Cache management service that stores serialized data."""
2
+ from src.serializer import serialize_value, deserialize_value
3
+
4
+
5
+ class CacheManager:
6
+ """Simple in-memory cache with serialization."""
7
+
8
+ def __init__(self):
9
+ self._store = {}
10
+
11
+ def set(self, key: str, value) -> None:
12
+ """Store a value in the cache after serializing it."""
13
+ # BUG: passing bytes (encoded) instead of str to serialize_value
14
+ serialized = serialize_value(str(value).encode('utf-8'))
15
+ self._store[key] = serialized
16
+
17
+ def get(self, key: str, default=None):
18
+ """Retrieve and deserialize a value from cache."""
19
+ if key not in self._store:
20
+ return default
21
+ return deserialize_value(self._store[key])
22
+
23
+ def delete(self, key: str) -> bool:
24
+ """Remove a key from cache."""
25
+ if key in self._store:
26
+ del self._store[key]
27
+ return True
28
+ return False
29
+
30
+ def clear(self):
31
+ """Clear all cached values."""
32
+ self._store.clear()
33
+
34
+ def keys(self) -> list:
35
+ """Return all cache keys."""
36
+ return list(self._store.keys())
repo_templates/task2/variant_5/src/config.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """Cache configuration constants."""
2
+
3
+ MAX_CACHE_SIZE = 1000
4
+ DEFAULT_TTL = 300 # seconds
repo_templates/task2/variant_5/src/serializer.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Serialization utilities for the cache system."""
2
+ import json
3
+
4
+
5
+ def serialize_value(value: str) -> str:
6
+ """
7
+ Serialize a value to a JSON string for storage.
8
+
9
+ Args:
10
+ value: must be a string (str type)
11
+
12
+ Returns:
13
+ JSON-encoded string
14
+ """
15
+ if not isinstance(value, str):
16
+ raise TypeError(f"value must be str, got {type(value).__name__}")
17
+ return json.dumps({"data": value})
18
+
19
+
20
+ def deserialize_value(serialized: str):
21
+ """Deserialize a JSON string back to the original value."""
22
+ if not isinstance(serialized, str):
23
+ raise TypeError(f"serialized must be str, got {type(serialized).__name__}")
24
+ result = json.loads(serialized)
25
+ return result.get("data")
repo_templates/task2/variant_5/tests/test_cache.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from src.cache_manager import CacheManager
3
+
4
+
5
+ def test_cache_set_and_get():
6
+ cache = CacheManager()
7
+ # FAILS — TypeError because bytes passed to serializer instead of str
8
+ cache.set("user:1", "Alice")
9
+ assert cache.get("user:1") == "Alice"
10
+
11
+
12
+ def test_cache_get_missing():
13
+ cache = CacheManager()
14
+ assert cache.get("nonexistent", "default") == "default"
15
+
16
+
17
+ def test_cache_delete():
18
+ cache = CacheManager()
19
+ # FAILS — same TypeError on set
20
+ cache.set("temp", "data")
21
+ assert cache.delete("temp") == True
22
+ assert cache.get("temp") is None
23
+
24
+
25
+ def test_cache_clear():
26
+ cache = CacheManager()
27
+ cache._store["a"] = '{"data": "1"}'
28
+ cache._store["b"] = '{"data": "2"}'
29
+ cache.clear()
30
+ assert cache.keys() == []
31
+
32
+
33
+ def test_cache_keys():
34
+ cache = CacheManager()
35
+ cache._store["x"] = '{"data": "1"}'
36
+ cache._store["y"] = '{"data": "2"}'
37
+ assert sorted(cache.keys()) == ["x", "y"]