Spaces:
Sleeping
Sleeping
Fix Docker base image tag and sync success_criteria across files
Browse files
- Dockerfile: python:3.11-slim -> python:3.11.9-slim-bookworm
(specific patch+variant tag avoids manifest digest cache misses
in the validator's Docker registry)
- openenv.yaml: easy success_criteria 0.55 -> 0.62 (matches grader)
- server.py: medium description 0.45 -> 0.50 (matches _grade() check)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- Dockerfile +1 -1
- openenv.yaml +3 -2
- server.py +2 -2
Dockerfile
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
FROM python:3.11-slim
|
| 2 |
|
| 3 |
# HF Spaces runs as a non-root user — create it early
|
| 4 |
RUN useradd -m -u 1000 appuser
|
|
|
|
| 1 |
+
FROM python:3.11.9-slim-bookworm
|
| 2 |
|
| 3 |
# HF Spaces runs as a non-root user — create it early
|
| 4 |
RUN useradd -m -u 1000 appuser
|
openenv.yaml
CHANGED
|
@@ -70,8 +70,9 @@ tasks:
|
|
| 70 |
difficulty: easy
|
| 71 |
description: >
|
| 72 |
Clean dataset (flip_y=0.05), budget=300, max_steps=15.
|
| 73 |
-
Agent must reach validation performance > 0.55.
|
| 74 |
-
|
|
|
|
| 75 |
|
| 76 |
- id: medium
|
| 77 |
difficulty: medium
|
|
|
|
| 70 |
difficulty: easy
|
| 71 |
description: >
|
| 72 |
Clean dataset (flip_y=0.05), budget=300, max_steps=15.
|
| 73 |
+
Agent must reach validation performance > 0.62.
|
| 74 |
+
Score is normalized over range [0.55, 0.75].
|
| 75 |
+
success_criteria: "current_performance > 0.62"
|
| 76 |
|
| 77 |
- id: medium
|
| 78 |
difficulty: medium
|
server.py
CHANGED
|
@@ -93,9 +93,9 @@ TASKS = {
|
|
| 93 |
"description": (
|
| 94 |
"High noise (flip_y=0.25), budget=150, max_steps=12. "
|
| 95 |
"Agent must reach performance > 0.52 while keeping average "
|
| 96 |
-
"noise selection rate below 0.45. Uncertainty-only strategies fail."
|
| 97 |
),
|
| 98 |
-
"success_criteria": "current_performance > 0.52 AND avg noise_ratio < 0.45",
|
| 99 |
"cfg_overrides": {
|
| 100 |
"data": {"flip_y": 0.25},
|
| 101 |
"budget": 150,
|
|
|
|
| 93 |
"description": (
|
| 94 |
"High noise (flip_y=0.25), budget=150, max_steps=12. "
|
| 95 |
"Agent must reach performance > 0.52 while keeping average "
|
| 96 |
+
"noise selection rate below 0.50. Uncertainty-only strategies fail."
|
| 97 |
),
|
| 98 |
+
"success_criteria": "current_performance > 0.52 AND avg noise_ratio < 0.50",
|
| 99 |
"cfg_overrides": {
|
| 100 |
"data": {"flip_y": 0.25},
|
| 101 |
"budget": 150,
|