aryxn323 committed on
Commit
e9652e1
·
verified ·
1 Parent(s): 75ec31b

Upload 7 files

Browse files
Files changed (1) hide show
  1. graders.py +21 -11
graders.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  FrontierLabs-Env: Deterministic Graders
3
- All graders return a float in [0.0, 1.0] based on strict programmatic analysis.
4
  """
5
 
6
  import json
@@ -27,7 +27,7 @@ def grade_task1(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
27
  3. metrics_report.json exists (0.1)
28
  4. Agent's self-reported F1 matches ground truth F1 (0.4)
29
 
30
- Returns score in [0.0, 1.0].
31
  """
32
  details: Dict[str, Any] = {}
33
  score = 0.0
@@ -37,7 +37,8 @@ def grade_task1(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
37
  if not cleaned_content:
38
  details["cleaned_file"] = "MISSING"
39
  details["score_breakdown"] = {"file_exists": 0.0, "cleaning_quality": 0.0, "report_exists": 0.0, "self_eval_accuracy": 0.0}
40
- return 0.0, details
 
41
 
42
  details["cleaned_file"] = "EXISTS"
43
  score += 0.10
@@ -72,7 +73,9 @@ def grade_task1(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
72
  if not report_content:
73
  details["metrics_report"] = "MISSING"
74
  details["score_breakdown"] = {"file_exists": 0.1, "cleaning_quality": round(score - 0.1, 4), "report_exists": 0.0, "self_eval_accuracy": 0.0}
75
- return round(score, 4), details
 
 
76
 
77
  details["metrics_report"] = "EXISTS"
78
  score += 0.10
@@ -102,7 +105,8 @@ def grade_task1(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
102
  except (json.JSONDecodeError, ValueError, TypeError) as e:
103
  details["self_evaluation"] = f"Parse error: {e}"
104
 
105
- final = round(min(1.0, max(0.0, score)), 4)
 
106
  details["final_score"] = final
107
  return final, details
108
 
@@ -141,7 +145,8 @@ def grade_task2(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
141
  content = get_file("train_fsdp.py")
142
  if not content:
143
  details["file"] = "MISSING"
144
- return 0.0, details
 
145
 
146
  details["file"] = "EXISTS"
147
  score += 0.10
@@ -205,7 +210,9 @@ def grade_task2(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
205
  details["memory_simulation"] = {"note": "Skipped — insufficient FSDP implementation", "partial_score": 0.0}
206
 
207
  score += memory_score
208
- final = round(min(1.0, max(0.0, score)), 4)
 
 
209
  details["final_score"] = final
210
  return final, details
211
 
@@ -248,7 +255,8 @@ def grade_task3(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
248
  content = get_file("fast_silu_kernel.py")
249
  if not content:
250
  details["file"] = "MISSING"
251
- return 0.0, details
 
252
 
253
  details["file"] = "EXISTS"
254
  score += 0.10
@@ -326,7 +334,8 @@ def grade_task3(env_state: Dict[str, Any], get_file) -> Tuple[float, Dict[str, A
326
  }
327
  score += latency_score
328
 
329
- final = round(min(1.0, max(0.0, score)), 4)
 
330
  details["final_score"] = final
331
  return final, details
332
 
@@ -347,11 +356,12 @@ def grade(task_id: str, env_state: Dict[str, Any], get_file) -> Dict[str, Any]:
347
  elif task_id == "task3_triton_kernel":
348
  score, details = grade_task3(env_state, get_file)
349
  else:
350
- return {"task_id": task_id, "score": 0.0, "details": {"error": f"Unknown task: {task_id}"}, "passed": False}
 
351
 
352
  return {
353
  "task_id": task_id,
354
  "score": score,
355
  "passed": score >= 0.8,
356
  "details": details,
357
- }
 
1
  """
2
  FrontierLabs-Env: Deterministic Graders
3
+ All graders return a float strictly in (0.0, 1.0) based on strict programmatic analysis.
4
  """
5
 
6
  import json
 
27
  3. metrics_report.json exists (0.1)
28
  4. Agent's self-reported F1 matches ground truth F1 (0.4)
29
 
30
+ Returns score strictly in (0.0, 1.0).
31
  """
32
  details: Dict[str, Any] = {}
33
  score = 0.0
 
37
  if not cleaned_content:
38
  details["cleaned_file"] = "MISSING"
39
  details["score_breakdown"] = {"file_exists": 0.0, "cleaning_quality": 0.0, "report_exists": 0.0, "self_eval_accuracy": 0.0}
40
+ # BOUNDARY FIX: Return 0.001 instead of 0.0
41
+ return 0.001, details
42
 
43
  details["cleaned_file"] = "EXISTS"
44
  score += 0.10
 
73
  if not report_content:
74
  details["metrics_report"] = "MISSING"
75
  details["score_breakdown"] = {"file_exists": 0.1, "cleaning_quality": round(score - 0.1, 4), "report_exists": 0.0, "self_eval_accuracy": 0.0}
76
+ # BOUNDARY FIX: Ensure score is strictly > 0.0 and < 1.0
77
+ safe_score = round(min(0.999, max(0.001, score)), 4)
78
+ return safe_score, details
79
 
80
  details["metrics_report"] = "EXISTS"
81
  score += 0.10
 
105
  except (json.JSONDecodeError, ValueError, TypeError) as e:
106
  details["self_evaluation"] = f"Parse error: {e}"
107
 
108
+ # BOUNDARY FIX: Clamp score between 0.001 and 0.999
109
+ final = round(min(0.999, max(0.001, score)), 4)
110
  details["final_score"] = final
111
  return final, details
112
 
 
145
  content = get_file("train_fsdp.py")
146
  if not content:
147
  details["file"] = "MISSING"
148
+ # BOUNDARY FIX: Return 0.001 instead of 0.0
149
+ return 0.001, details
150
 
151
  details["file"] = "EXISTS"
152
  score += 0.10
 
210
  details["memory_simulation"] = {"note": "Skipped — insufficient FSDP implementation", "partial_score": 0.0}
211
 
212
  score += memory_score
213
+
214
+ # BOUNDARY FIX: Clamp score between 0.001 and 0.999
215
+ final = round(min(0.999, max(0.001, score)), 4)
216
  details["final_score"] = final
217
  return final, details
218
 
 
255
  content = get_file("fast_silu_kernel.py")
256
  if not content:
257
  details["file"] = "MISSING"
258
+ # BOUNDARY FIX: Return 0.001 instead of 0.0
259
+ return 0.001, details
260
 
261
  details["file"] = "EXISTS"
262
  score += 0.10
 
334
  }
335
  score += latency_score
336
 
337
+ # BOUNDARY FIX: Clamp score between 0.001 and 0.999
338
+ final = round(min(0.999, max(0.001, score)), 4)
339
  details["final_score"] = final
340
  return final, details
341
 
 
356
  elif task_id == "task3_triton_kernel":
357
  score, details = grade_task3(env_state, get_file)
358
  else:
359
+ # BOUNDARY FIX: Return 0.001 for unknown tasks
360
+ return {"task_id": task_id, "score": 0.001, "details": {"error": f"Unknown task: {task_id}"}, "passed": False}
361
 
362
  return {
363
  "task_id": task_id,
364
  "score": score,
365
  "passed": score >= 0.8,
366
  "details": details,
367
+ }