Upload 7 files

graders.py  CHANGED  +21 -11
@@ -1,6 +1,6 @@
 """
 FrontierLabs-Env: Deterministic Graders
-All graders return a float in [0.0, 1.0] based on strict programmatic analysis.
+All graders return a float strictly in (0.0, 1.0) based on strict programmatic analysis.
 """
 
 import json
@@ -27,7 +27,7 @@ def grade_task1(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
     3. metrics_report.json exists (0.1)
     4. Agent's self-reported F1 matches ground truth F1 (0.4)
 
-    Returns score in [0.0, 1.0].
+    Returns score strictly in (0.0, 1.0).
     """
     details: Dict[str, Any] = {}
     score = 0.0
@@ -37,7 +37,8 @@ def grade_task1(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
     if not cleaned_content:
         details["cleaned_file"] = "MISSING"
         details["score_breakdown"] = {"file_exists": 0.0, "cleaning_quality": 0.0, "report_exists": 0.0, "self_eval_accuracy": 0.0}
-        return 0.0, details
+        # BOUNDARY FIX: Return 0.001 instead of 0.0
+        return 0.001, details
 
     details["cleaned_file"] = "EXISTS"
     score += 0.10
@@ -72,7 +73,9 @@ def grade_task1(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
     if not report_content:
         details["metrics_report"] = "MISSING"
         details["score_breakdown"] = {"file_exists": 0.1, "cleaning_quality": round(score - 0.1, 4), "report_exists": 0.0, "self_eval_accuracy": 0.0}
-        return round(score, 4), details
+        # BOUNDARY FIX: Ensure score is strictly > 0.0 and < 1.0
+        safe_score = round(min(0.999, max(0.001, score)), 4)
+        return safe_score, details
 
     details["metrics_report"] = "EXISTS"
     score += 0.10
@@ -102,7 +105,8 @@ def grade_task1(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
     except (json.JSONDecodeError, ValueError, TypeError) as e:
         details["self_evaluation"] = f"Parse error: {e}"
 
-    final = round(score, 4)
+    # BOUNDARY FIX: Clamp score between 0.001 and 0.999
+    final = round(min(0.999, max(0.001, score)), 4)
     details["final_score"] = final
     return final, details
 
@@ -141,7 +145,8 @@ def grade_task2(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
     content = get_file("train_fsdp.py")
     if not content:
         details["file"] = "MISSING"
-        return 0.0, details
+        # BOUNDARY FIX: Return 0.001 instead of 0.0
+        return 0.001, details
 
     details["file"] = "EXISTS"
     score += 0.10
@@ -205,7 +210,9 @@ def grade_task2(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
         details["memory_simulation"] = {"note": "Skipped — insufficient FSDP implementation", "partial_score": 0.0}
 
     score += memory_score
-    final = round(score, 4)
+
+    # BOUNDARY FIX: Clamp score between 0.001 and 0.999
+    final = round(min(0.999, max(0.001, score)), 4)
     details["final_score"] = final
     return final, details
 
@@ -248,7 +255,8 @@ def grade_task3(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
     content = get_file("fast_silu_kernel.py")
     if not content:
         details["file"] = "MISSING"
-        return 0.0, details
+        # BOUNDARY FIX: Return 0.001 instead of 0.0
+        return 0.001, details
 
     details["file"] = "EXISTS"
     score += 0.10
@@ -326,7 +334,8 @@ def grade_task3(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
     }
     score += latency_score
 
-    final = round(score, 4)
+    # BOUNDARY FIX: Clamp score between 0.001 and 0.999
+    final = round(min(0.999, max(0.001, score)), 4)
     details["final_score"] = final
     return final, details
 
@@ -347,11 +356,12 @@ def grade(task_id: str, env_state: Dict[str, Any], get_file) -> Dict[str, Any]:
     elif task_id == "task3_triton_kernel":
         score, details = grade_task3(env_state, get_file)
     else:
-        raise ValueError(f"Unknown task: {task_id}")
+        # BOUNDARY FIX: Return 0.001 for unknown tasks
+        return {"task_id": task_id, "score": 0.001, "details": {"error": f"Unknown task: {task_id}"}, "passed": False}
 
     return {
         "task_id": task_id,
         "score": score,
         "passed": score >= 0.8,
         "details": details,
-        }
+    }
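The clamp expression round(min(0.999, max(0.001, score)), 4) now appears four times across the graders. If the module were later refactored, it could live in a single helper; the following is a minimal sketch under that assumption, and the name clamp_open_unit is hypothetical rather than part of graders.py:

def clamp_open_unit(score: float, eps: float = 0.001, ndigits: int = 4) -> float:
    # Same behavior as the committed expression: pull the raw score off both
    # endpoints of [0.0, 1.0] into (0.0, 1.0), then round for stable reporting.
    return round(min(1.0 - eps, max(eps, score)), ndigits)

Each final = round(min(0.999, max(0.001, score)), 4) line would then shrink to final = clamp_open_unit(score), keeping the boundary behavior identical across all three graders.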
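To see how a consumer observes the boundary fix end to end, here is a minimal usage sketch. The empty env_state and the get_file stub are illustrative assumptions, as is the premise that grade_task3 reaches its file check without consulting env_state first:

def get_file_stub(path: str):
    # Hypothetical empty workspace: every file lookup reports "missing".
    return None

result = grade("task3_triton_kernel", env_state={}, get_file=get_file_stub)
# fast_silu_kernel.py is absent, so grade_task3 takes the boundary-fixed early
# return (0.001) and the dispatcher wraps it roughly as:
# {"task_id": "task3_triton_kernel", "score": 0.001, "passed": False,
#  "details": {"file": "MISSING"}}

One consequence worth noting: passed still flips at score >= 0.8, but after this change no grader can report an exact 0.0 or 1.0; the lowest representable score is 0.001 and the highest is 0.999.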