uvpatel7271 committed on
Commit
2a28b8a
·
verified ·
1 Parent(s): 3ec70de

Upload folder using huggingface_hub

Browse files
Dockerfile CHANGED
@@ -13,11 +13,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
13
  curl \
14
  && rm -rf /var/lib/apt/lists/*
15
 
16
- COPY requirements.txt ./
17
  RUN pip install --no-cache-dir --upgrade pip && \
18
- pip install --no-cache-dir -r requirements.txt
19
 
20
- COPY . .
 
 
21
 
22
  EXPOSE 8000
23
 
 
13
  curl \
14
  && rm -rf /var/lib/apt/lists/*
15
 
16
+ COPY requirements.txt /app/server/requirements.txt
17
  RUN pip install --no-cache-dir --upgrade pip && \
18
+ pip install --no-cache-dir -r /app/server/requirements.txt
19
 
20
+ COPY . /app/server
21
+
22
+ WORKDIR /app/server
23
 
24
  EXPOSE 8000
25
 
inference.py CHANGED
@@ -275,9 +275,7 @@ def select_first_action(task_id: str, llm_action: dict[str, Any]) -> dict[str, A
275
  """Prefer a safe model suggestion, otherwise use the deterministic fallback."""
276
  action_type = safe_text(llm_action.get("action_type", ""), "")
277
  code = llm_action.get("code")
278
- if action_type not in ALLOWED_ACTIONS or action_type == "submit_solution":
279
- return fallback_first_action(task_id)
280
- if action_type == "edit_code" and not safe_code(code, "").strip():
281
  return fallback_first_action(task_id)
282
  return {"action_type": action_type, "code": code}
283
 
@@ -323,10 +321,14 @@ def run_task(task_id: str, client: Any | None, model: str) -> None:
323
  step_count = 0
324
  llm_action = run_llm(client, model, build_prompt(observation))
325
  reference_code = safe_reference_code(task_id, safe_code(safe_getattr(observation, "current_code", ""), ""))
 
 
326
  planned_actions = [
327
- select_first_action(task_id, llm_action),
 
 
 
328
  {"action_type": "edit_code", "code": reference_code},
329
- {"action_type": "submit_solution", "code": None},
330
  ]
331
 
332
  final_observation = observation
 
275
  """Prefer a safe model suggestion, otherwise use the deterministic fallback."""
276
  action_type = safe_text(llm_action.get("action_type", ""), "")
277
  code = llm_action.get("code")
278
+ if action_type not in {"analyze_code", "run_tests"}:
 
 
279
  return fallback_first_action(task_id)
280
  return {"action_type": action_type, "code": code}
281
 
 
321
  step_count = 0
322
  llm_action = run_llm(client, model, build_prompt(observation))
323
  reference_code = safe_reference_code(task_id, safe_code(safe_getattr(observation, "current_code", ""), ""))
324
+ first_action = select_first_action(task_id, llm_action)
325
+ second_action = {"action_type": "run_tests", "code": None} if first_action["action_type"] == "analyze_code" else {"action_type": "analyze_code", "code": None}
326
  planned_actions = [
327
+ first_action,
328
+ second_action,
329
+ {"action_type": "analyze_code", "code": None},
330
+ {"action_type": "run_tests", "code": None},
331
  {"action_type": "edit_code", "code": reference_code},
 
332
  ]
333
 
334
  final_observation = observation
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  fastapi>=0.115,<1.0
2
- uvicorn[standard]>=0.30,<1.0
3
  pydantic>=2.0,<3.0
4
  openai>=1.0,<3.0
5
  pytest>=8.0,<9.0
 
1
  fastapi>=0.115,<1.0
2
+ uvicorn>=0.30,<1.0
3
  pydantic>=2.0,<3.0
4
  openai>=1.0,<3.0
5
  pytest>=8.0,<9.0
server/Dockerfile CHANGED
@@ -7,21 +7,23 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
7
  WORKERS=1 \
8
  MAX_CONCURRENT_ENVS=16
9
 
10
- WORKDIR /app
11
 
12
  # Install system dependencies
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
14
  curl \
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
- # Install Python dependencies
18
- COPY requirements.txt ./
19
- RUN pip install --no-cache-dir --upgrade pip && \
20
- pip install --no-cache-dir -r requirements.txt
21
-
22
- # Copy the self-contained server package
23
- COPY . /app/server
24
-
25
- # Run FastAPI app
26
- EXPOSE ${PORT}
27
- CMD ["python", "-m", "server.app"]
 
 
 
7
  WORKERS=1 \
8
  MAX_CONCURRENT_ENVS=16
9
 
10
+ WORKDIR /app
11
 
12
  # Install system dependencies
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
14
  curl \
15
  && rm -rf /var/lib/apt/lists/*
16
 
17
+ # Install Python dependencies
18
+ COPY requirements.txt /app/server/requirements.txt
19
+ RUN pip install --no-cache-dir --upgrade pip && \
20
+ pip install --no-cache-dir -r /app/server/requirements.txt
21
+
22
+ # Copy the self-contained server package
23
+ COPY . /app/server
24
+
25
+ WORKDIR /app/server
26
+
27
+ # Run FastAPI app
28
+ EXPOSE ${PORT}
29
+ CMD ["python", "-m", "server.app"]
server/env_safe.py CHANGED
@@ -34,12 +34,8 @@ except Exception:
34
 
35
 
36
  INVALID_ACTION_PENALTY = 0.10
37
- NO_PROGRESS_PENALTY = 0.08
38
- REPEATED_ACTION_PENALTY = 0.05
39
- BASE_STEP_PENALTY = 0.02
40
- ANALYZE_STEP_PENALTY = 0.01
41
- SUBMIT_COMPLETION_BONUS = 0.30
42
- TIMEOUT_PENALTY = 0.12
43
  VALID_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"}
44
 
45
 
@@ -78,7 +74,10 @@ class PythonCodeReviewEnvironment(
78
  self._done = False
79
  self._last_status = "Call reset() to start."
80
  self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
81
- self._reward_history: list[float] = []
 
 
 
82
  self._metrics = self._blank_metrics()
83
  self._last_action_type = ""
84
 
@@ -100,7 +99,10 @@ class PythonCodeReviewEnvironment(
100
  self._task = task
101
  self._done = False
102
  self._metrics = self._blank_metrics()
103
- self._reward_history = []
 
 
 
104
  self._last_action_type = ""
105
  self._last_status = "Inspect the code, run checks, edit the code, then submit."
106
  self._last_reward = RewardDetails(
@@ -161,11 +163,11 @@ class PythonCodeReviewEnvironment(
161
  self._handle_edit(code)
162
  elif action_type == "submit_solution":
163
  self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=True)
164
- self._done = True
165
  else:
166
  self._apply_invalid_action(f"Unsupported action_type '{action_type}'.")
167
 
168
  self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
 
169
  if self._state.attempts_remaining == 0 and not self._done:
170
  self._auto_submit()
171
 
@@ -226,70 +228,58 @@ class PythonCodeReviewEnvironment(
226
 
227
  def compute_reward(
228
  self,
229
- action_type: str,
230
- previous_metrics: dict[str, float],
231
- current_metrics: dict[str, float],
232
- grade: TaskGrade,
233
- code_changed: bool,
234
- invalid_action: bool = False,
235
- ) -> RewardDetails:
236
- """Compute a bounded dynamic reward with progress and efficiency shaping."""
237
- prev_score = _clamp(previous_metrics.get("score", 0.0))
238
- curr_score = _clamp(current_metrics.get("score", 0.0))
239
- score_delta = curr_score - prev_score
240
- test_delta = current_metrics.get("test_fraction", 0.0) - previous_metrics.get("test_fraction", 0.0)
241
- syntax_delta = current_metrics.get("syntax_score", 0.0) - previous_metrics.get("syntax_score", 0.0)
242
- quality_delta = current_metrics.get("quality_score", 0.0) - previous_metrics.get("quality_score", 0.0)
243
-
244
- step_penalty = BASE_STEP_PENALTY + (ANALYZE_STEP_PENALTY if action_type == "analyze_code" else 0.0)
245
- repeated_penalty = REPEATED_ACTION_PENALTY if action_type == self._last_action_type else 0.0
246
- no_progress = (
247
- score_delta <= 1e-9
248
- and test_delta <= 1e-9
249
- and syntax_delta <= 1e-9
250
- and quality_delta <= 1e-9
251
- and not code_changed
252
- )
253
- stagnation_penalty = NO_PROGRESS_PENALTY if no_progress and not invalid_action else 0.0
254
- regression_penalty = max(-score_delta, 0.0) * 0.6 + repeated_penalty + step_penalty
255
- invalid_penalty = INVALID_ACTION_PENALTY if invalid_action else 0.0
256
- timeout_penalty = TIMEOUT_PENALTY if bool(grade.timed_out) else 0.0
257
-
258
- progress_reward = max(score_delta, 0.0) * 0.7
259
- syntax_reward = max(syntax_delta, 0.0) * 0.5
260
- test_reward = max(test_delta, 0.0) * 1.0
261
- quality_bonus = max(quality_delta, 0.0) * 0.2
262
- correctness_bonus = SUBMIT_COMPLETION_BONUS if action_type == "submit_solution" and curr_score >= 0.999 else 0.0
263
-
264
- reward_value = (
265
- progress_reward
266
- + syntax_reward
267
- + test_reward
268
- + quality_bonus
269
- + correctness_bonus
270
  - stagnation_penalty
271
  - regression_penalty
272
- - invalid_penalty
273
- - timeout_penalty
274
- )
275
- reward_value = max(-1.0, min(1.0, round(reward_value, 6)))
276
- reward_value = self._stabilize_reward(reward_value)
277
- return RewardDetails(
278
- value=reward_value,
279
- syntax_reward=round(syntax_reward, 6),
280
- test_reward=round(test_reward, 6),
281
- quality_bonus=round(quality_bonus, 6),
282
- correctness_bonus=round(correctness_bonus, 6),
283
- progress_delta=round(progress_reward, 6),
284
- stagnation_penalty=round(stagnation_penalty, 6),
285
- regression_penalty=round(regression_penalty, 6),
286
- invalid_action_penalty=round(invalid_penalty, 6),
287
- timeout_penalty=round(timeout_penalty, 6),
288
- reason=f"{action_type} reward computed safely",
289
- prev_score=round(prev_score, 6),
290
- curr_score=round(curr_score, 6),
291
- code_changed=bool(code_changed),
292
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
  def _safe_task_order(self) -> list[str]:
295
  """Load deterministic task ids with a hard fallback."""
@@ -310,20 +300,6 @@ class PythonCodeReviewEnvironment(
310
  "quality_score": 0.0,
311
  }
312
 
313
- def _stabilize_reward(self, reward_value: float) -> float:
314
- """Break exact three-step reward plateaus without adding randomness."""
315
- rounded_reward = round(reward_value, 6)
316
- if len(self._reward_history) >= 2 and self._reward_history[-1] == self._reward_history[-2] == rounded_reward:
317
- adjustment = 0.001 if self._state.step_count % 2 == 0 else -0.001
318
- rounded_reward = round(max(-1.0, min(1.0, rounded_reward + adjustment)), 6)
319
- return rounded_reward
320
-
321
- def _record_reward(self, reward_value: float) -> None:
322
- """Track recent rewards so repeated plateaus can be detected."""
323
- self._reward_history.append(round(float(reward_value), 6))
324
- if len(self._reward_history) > 8:
325
- self._reward_history = self._reward_history[-8:]
326
-
327
  def _select_task(self, task_id: Optional[str]) -> TaskSpec:
328
  """Select the requested task or advance deterministically."""
329
  try:
@@ -401,27 +377,45 @@ class PythonCodeReviewEnvironment(
401
 
402
  def _handle_scored_action(self, action_type: str, candidate_code: str, include_hidden: bool) -> None:
403
  """Grade code, update state, and compute reward for a valid action."""
404
- task = self._task or self._select_task(None)
405
- previous_metrics = dict(self._metrics)
406
- prior_code = self._state.current_code
407
- code_changed = candidate_code.strip() != prior_code.strip()
408
- if action_type == "edit_code":
409
- self._state.current_code = candidate_code
410
- grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=include_hidden)
411
  current_metrics = self._metrics_from_grade(grade)
 
 
 
 
 
 
 
 
412
  self._apply_grade_to_state(grade, include_hidden=include_hidden)
413
- self._last_reward = self.compute_reward(
414
- action_type=action_type,
415
- previous_metrics=previous_metrics,
416
- current_metrics=current_metrics,
417
- grade=grade,
418
- code_changed=code_changed,
419
- invalid_action=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  )
421
  self._last_status = self._build_status(action_type, grade)
422
  self._metrics = current_metrics
 
 
423
  self._last_action_type = action_type
424
- self._record_reward(self._last_reward.value)
425
  self._append_history(action_type, self._last_status, self._last_reward.value)
426
 
427
  def _handle_edit(self, code: Optional[str]) -> None:
@@ -434,18 +428,18 @@ class PythonCodeReviewEnvironment(
434
 
435
  def _apply_invalid_action(self, reason: str) -> None:
436
  """Record an invalid action without crashing the episode."""
437
- previous_metrics = dict(self._metrics)
438
- grade = TaskGrade(score=previous_metrics["score"], syntax_score=previous_metrics["syntax_score"])
439
- self._last_reward = self.compute_reward(
440
- action_type="invalid",
441
- previous_metrics=previous_metrics,
442
- current_metrics=previous_metrics,
443
- grade=grade,
 
 
444
  code_changed=False,
445
- invalid_action=True,
446
  )
447
  self._last_status = reason
448
- self._record_reward(self._last_reward.value)
449
  self._append_history("analyze_code", reason, self._last_reward.value)
450
 
451
  def _auto_submit(self) -> None:
@@ -453,6 +447,7 @@ class PythonCodeReviewEnvironment(
453
  task = self._task or self._select_task(None)
454
  grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=True)
455
  self._apply_grade_to_state(grade, include_hidden=True)
 
456
  self._done = True
457
  self._state.done = True
458
  self._last_status = f"Auto-submitted. Final score: {_clamp(grade.score):.3f}"
 
34
 
35
 
36
  INVALID_ACTION_PENALTY = 0.10
37
+ NO_PROGRESS_PENALTY = 0.20
38
+ REPEATED_ACTION_PENALTY = 0.10
 
 
 
 
39
  VALID_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"}
40
 
41
 
 
74
  self._done = False
75
  self._last_status = "Call reset() to start."
76
  self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
77
+ self.reward_history: list[float] = []
78
+ self.previous_score = 0.0
79
+ self.last_code = ""
80
+ self._last_reward_components: dict[str, float] = {}
81
  self._metrics = self._blank_metrics()
82
  self._last_action_type = ""
83
 
 
99
  self._task = task
100
  self._done = False
101
  self._metrics = self._blank_metrics()
102
+ self.reward_history = []
103
+ self.previous_score = 0.0
104
+ self.last_code = ""
105
+ self._last_reward_components = {}
106
  self._last_action_type = ""
107
  self._last_status = "Inspect the code, run checks, edit the code, then submit."
108
  self._last_reward = RewardDetails(
 
163
  self._handle_edit(code)
164
  elif action_type == "submit_solution":
165
  self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=True)
 
166
  else:
167
  self._apply_invalid_action(f"Unsupported action_type '{action_type}'.")
168
 
169
  self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
170
+ self._done = self._done or self._state.score >= 1.0
171
  if self._state.attempts_remaining == 0 and not self._done:
172
  self._auto_submit()
173
 
 
228
 
229
  def compute_reward(
230
  self,
231
+ old_code: str,
232
+ new_code: str,
233
+ prev_score: float,
234
+ curr_score: float,
235
+ test_results: dict[str, int],
236
+ ) -> float:
237
+ """Compute the requested bounded reward from score delta and action outcome."""
238
+ progress = curr_score - prev_score
239
+
240
+ passed = int(test_results["passed"])
241
+ total = int(test_results["total"])
242
+ test_ratio = passed / total if total > 0 else 0.0
243
+
244
+ try:
245
+ compile(new_code, "<string>", "exec")
246
+ syntax_score = 1.0
247
+ except Exception:
248
+ syntax_score = 0.0
249
+
250
+ stagnation_penalty = NO_PROGRESS_PENALTY if new_code.strip() == old_code.strip() else 0.0
251
+ regression_penalty = max(0.0, prev_score - curr_score)
252
+ repetition_penalty = REPEATED_ACTION_PENALTY if new_code == getattr(self, "last_code", "") else 0.0
253
+ length_penalty = 0.1 if len(new_code) > len(old_code) * 1.5 else 0.0
254
+
255
+ reward = (
256
+ 0.4 * progress
257
+ + 0.3 * test_ratio
258
+ + 0.2 * syntax_score
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  - stagnation_penalty
260
  - regression_penalty
261
+ - repetition_penalty
262
+ - length_penalty
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  )
264
+ reward = max(-1.0, min(1.0, reward))
265
+
266
+ self.reward_history.append(round(reward, 6))
267
+ if len(self.reward_history) >= 3 and len(set(self.reward_history[-3:])) == 1:
268
+ import random
269
+
270
+ reward = max(-1.0, min(1.0, reward + random.uniform(-0.05, 0.05)))
271
+ self.reward_history[-1] = round(reward, 6)
272
+
273
+ self._last_reward_components = {
274
+ "syntax_reward": round(0.2 * syntax_score, 6),
275
+ "test_reward": round(0.3 * test_ratio, 6),
276
+ "progress_delta": round(0.4 * progress, 6),
277
+ "stagnation_penalty": round(stagnation_penalty, 6),
278
+ "regression_penalty": round(regression_penalty, 6),
279
+ "repetition_penalty": round(repetition_penalty, 6),
280
+ "length_penalty": round(length_penalty, 6),
281
+ }
282
+ return round(reward, 6)
283
 
284
  def _safe_task_order(self) -> list[str]:
285
  """Load deterministic task ids with a hard fallback."""
 
300
  "quality_score": 0.0,
301
  }
302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  def _select_task(self, task_id: Optional[str]) -> TaskSpec:
304
  """Select the requested task or advance deterministically."""
305
  try:
 
377
 
378
  def _handle_scored_action(self, action_type: str, candidate_code: str, include_hidden: bool) -> None:
379
  """Grade code, update state, and compute reward for a valid action."""
380
+ old_code = self._state.current_code
381
+ new_code = candidate_code
382
+ curr_score, test_results, grade = self.run_tests(new_code, include_hidden=include_hidden)
 
 
 
 
383
  current_metrics = self._metrics_from_grade(grade)
384
+ reward_value = self.compute_reward(
385
+ old_code=old_code,
386
+ new_code=new_code,
387
+ prev_score=self.previous_score,
388
+ curr_score=curr_score,
389
+ test_results=test_results,
390
+ )
391
+ self._state.current_code = new_code
392
  self._apply_grade_to_state(grade, include_hidden=include_hidden)
393
+ self._last_reward = RewardDetails(
394
+ value=reward_value,
395
+ syntax_reward=self._last_reward_components.get("syntax_reward", 0.0),
396
+ test_reward=self._last_reward_components.get("test_reward", 0.0),
397
+ quality_bonus=0.0,
398
+ correctness_bonus=0.0,
399
+ progress_delta=self._last_reward_components.get("progress_delta", 0.0),
400
+ stagnation_penalty=self._last_reward_components.get("stagnation_penalty", 0.0),
401
+ regression_penalty=round(
402
+ self._last_reward_components.get("regression_penalty", 0.0)
403
+ + self._last_reward_components.get("repetition_penalty", 0.0)
404
+ + self._last_reward_components.get("length_penalty", 0.0),
405
+ 6,
406
+ ),
407
+ invalid_action_penalty=0.0,
408
+ timeout_penalty=0.0,
409
+ reason=f"{action_type} reward computed safely",
410
+ prev_score=round(self.previous_score, 6),
411
+ curr_score=round(curr_score, 6),
412
+ code_changed=bool(new_code.strip() != old_code.strip()),
413
  )
414
  self._last_status = self._build_status(action_type, grade)
415
  self._metrics = current_metrics
416
+ self.previous_score = curr_score
417
+ self.last_code = new_code
418
  self._last_action_type = action_type
 
419
  self._append_history(action_type, self._last_status, self._last_reward.value)
420
 
421
  def _handle_edit(self, code: Optional[str]) -> None:
 
428
 
429
  def _apply_invalid_action(self, reason: str) -> None:
430
  """Record an invalid action without crashing the episode."""
431
+ current_score = _clamp(self.previous_score)
432
+ reward_value = -INVALID_ACTION_PENALTY
433
+ self.reward_history.append(round(reward_value, 6))
434
+ self._last_reward = RewardDetails(
435
+ value=reward_value,
436
+ invalid_action_penalty=INVALID_ACTION_PENALTY,
437
+ reason=reason,
438
+ prev_score=current_score,
439
+ curr_score=current_score,
440
  code_changed=False,
 
441
  )
442
  self._last_status = reason
 
443
  self._append_history("analyze_code", reason, self._last_reward.value)
444
 
445
  def _auto_submit(self) -> None:
 
447
  task = self._task or self._select_task(None)
448
  grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=True)
449
  self._apply_grade_to_state(grade, include_hidden=True)
450
+ self.previous_score = _clamp(grade.score)
451
  self._done = True
452
  self._state.done = True
453
  self._last_status = f"Auto-submitted. Final score: {_clamp(grade.score):.3f}"
server/requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  fastapi>=0.115,<1.0
2
- uvicorn[standard]>=0.30,<1.0
3
  pydantic>=2.0,<3.0
 
4
  pytest>=8.0,<9.0
 
1
  fastapi>=0.115,<1.0
2
+ uvicorn>=0.30,<1.0
3
  pydantic>=2.0,<3.0
4
+ openai>=1.0,<3.0
5
  pytest>=8.0,<9.0