uvpatel7271 committed on
Commit 5d806ad · 1 Parent(s): cbcbc92

final grading logic fixes

graders/bug_fix.py CHANGED
@@ -9,7 +9,16 @@ except ImportError:
     from models import TaskGrade
     from tasks.catalog import ReviewTask
 
-from .shared import base_grade, compile_code, execute_cases, quality_metrics, similarity_score, summarize_results
+from .shared import (
+    base_grade,
+    compile_code,
+    component_score,
+    execute_cases,
+    quality_metrics,
+    shaped_score,
+    similarity_score,
+    summarize_results,
+)
 
 
 def grade_bug_fix_task(
@@ -31,16 +40,16 @@ def grade_bug_fix_task(
     }
 
     if not compiled:
-        partial = round(min(0.2, similarity_score(code, task.reference_code) * 0.2), 3)
+        progress = 0.02 + 0.12 * similarity_score(code, task.reference_code)
         details["test_results"] = []
         details["test_summary"] = "Code does not compile."
         return base_grade(
-            score=partial,
-            syntax_score=0.0,
+            score=shaped_score(progress),
+            syntax_score=component_score(0.01),
             tests_passed=0,
             tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
-            quality_score=0.0,
-            runtime_score=0.0,
+            quality_score=component_score(0.01),
+            runtime_score=component_score(0.01),
             timed_out=False,
             details=details,
         )
@@ -50,26 +59,28 @@ def grade_bug_fix_task(
     if result.get("timed_out"):
         details["test_results"] = []
         details["test_summary"] = result["error"]
+        progress = 0.12 + 0.18 * quality["score"]
         return base_grade(
-            score=0.0,
-            syntax_score=1.0,
+            score=shaped_score(progress),
+            syntax_score=component_score(0.95),
             tests_passed=0,
             tests_total=len(cases),
            quality_score=quality["score"],
-            runtime_score=0.0,
+            runtime_score=component_score(0.01),
             timed_out=True,
             details=details,
         )
     if "error" in result:
         details["test_results"] = []
         details["test_summary"] = result["error"]
+        progress = 0.1 + 0.2 * quality["score"]
         return base_grade(
-            score=0.0,
-            syntax_score=1.0,
+            score=shaped_score(progress),
+            syntax_score=component_score(0.95),
             tests_passed=0,
             tests_total=len(cases),
             quality_score=quality["score"],
-            runtime_score=0.0,
+            runtime_score=component_score(0.01),
             timed_out=False,
             details=details,
         )
@@ -78,13 +89,14 @@ def grade_bug_fix_task(
     pass_rate = data["passed"] / max(data["total"], 1)
     details["test_results"] = data["results"]
     details["test_summary"] = summarize_results("Test results", data["results"])
+    progress = min(1.0, 0.05 + 0.8 * pass_rate + 0.15 * quality["score"])
     return base_grade(
-        score=pass_rate,
-        syntax_score=1.0,
+        score=shaped_score(progress),
+        syntax_score=component_score(0.95),
         tests_passed=data["passed"],
        tests_total=data["total"],
         quality_score=quality["score"],
-        runtime_score=0.0,
+        runtime_score=component_score(0.01),
         timed_out=False,
         details=details,
     )
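For context, a small worked example (not part of the commit) of how the revised passing-tests branch above composes its final value; pass_rate and quality are illustrative inputs, and shaped_score, POOR_SCORE, and NEAR_PERFECT_SCORE come from graders/shared.py in this same commit:

    # Illustrative numbers only: 3 of 4 tests pass, quality score 0.6.
    pass_rate = 0.75
    quality = 0.6
    progress = min(1.0, 0.05 + 0.8 * pass_rate + 0.15 * quality)   # 0.05 + 0.60 + 0.09 = 0.74
    # shaped_score(0.74) = 0.1 + (0.95 - 0.1) * 0.74 = 0.729, so the grade stays strictly inside (0, 1).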
graders/optimization.py CHANGED
@@ -13,8 +13,10 @@ from .shared import (
     base_grade,
     benchmark_candidate,
     compile_code,
+    component_score,
     execute_cases,
     quality_metrics,
+    shaped_score,
     similarity_score,
     summarize_results,
 )
@@ -39,16 +41,16 @@ def grade_optimization_task(
     }
 
     if not compiled:
-        partial = round(min(0.15, similarity_score(code, task.reference_code) * 0.15), 3)
+        progress = 0.02 + 0.1 * similarity_score(code, task.reference_code)
         details["test_results"] = []
         details["test_summary"] = "Code does not compile."
         return base_grade(
-            score=partial,
-            syntax_score=0.0,
+            score=shaped_score(progress),
+            syntax_score=component_score(0.01),
             tests_passed=0,
             tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
-            quality_score=0.0,
-            runtime_score=0.0,
+            quality_score=component_score(0.01),
+            runtime_score=component_score(0.01),
             timed_out=False,
             details=details,
         )
@@ -58,33 +60,35 @@ def grade_optimization_task(
     if result.get("timed_out"):
         details["test_results"] = []
         details["test_summary"] = result["error"]
+        progress = 0.1 + 0.18 * quality["score"]
         return base_grade(
-            score=0.0,
-            syntax_score=1.0,
+            score=shaped_score(progress),
+            syntax_score=component_score(0.95),
             tests_passed=0,
             tests_total=len(cases),
             quality_score=quality["score"],
-            runtime_score=0.0,
+            runtime_score=component_score(0.01),
             timed_out=True,
             details=details,
         )
     if "error" in result:
         details["test_results"] = []
         details["test_summary"] = result["error"]
+        progress = 0.1 + 0.2 * quality["score"]
         return base_grade(
-            score=0.0,
-            syntax_score=1.0,
+            score=shaped_score(progress),
+            syntax_score=component_score(0.95),
             tests_passed=0,
             tests_total=len(cases),
             quality_score=quality["score"],
-            runtime_score=0.0,
+            runtime_score=component_score(0.01),
             timed_out=False,
             details=details,
         )
 
     data = result["data"]
     pass_rate = data["passed"] / max(data["total"], 1)
-    runtime_score = 0.0
+    runtime_score = component_score(0.01)
     benchmark_summary = "Benchmark deferred until hidden evaluation."
     timed_out = False
 
@@ -94,20 +98,21 @@ def grade_optimization_task(
         timed_out = benchmark.get("timed_out", False)
         benchmark_summary = benchmark["details"]
         if timed_out:
-            runtime_score = 0.0
+            runtime_score = component_score(0.01)
 
     details["test_results"] = data["results"]
     details["test_summary"] = summarize_results("Test results", data["results"])
     details["benchmark"] = benchmark_summary
 
+    runtime_progress = 0.0 if benchmark_summary == "Benchmark deferred until hidden evaluation." else runtime_score
     if include_hidden:
-        score = 0.5 * pass_rate + 0.3 * runtime_score + 0.2 * quality["score"]
+        progress = min(1.0, 0.05 + 0.6 * pass_rate + 0.2 * quality["score"] + 0.15 * runtime_progress)
     else:
-        score = 0.7 * pass_rate + 0.3 * quality["score"]
+        progress = min(1.0, 0.05 + 0.7 * pass_rate + 0.25 * quality["score"])
 
     return base_grade(
-        score=score,
-        syntax_score=1.0,
+        score=shaped_score(progress),
+        syntax_score=component_score(0.95),
         tests_passed=data["passed"],
         tests_total=data["total"],
         quality_score=quality["score"],
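As a reading aid (not from the diff), the optimization grader now counts runtime only once the benchmark has actually run; a minimal sketch of the branching above, using only weights that appear in this hunk (the helper name itself is hypothetical):

    # Hypothetical helper mirroring the include_hidden branching in grade_optimization_task.
    def sketch_optimization_progress(pass_rate, quality, runtime_score, include_hidden, benchmark_ran):
        runtime_progress = runtime_score if benchmark_ran else 0.0
        if include_hidden:
            return min(1.0, 0.05 + 0.6 * pass_rate + 0.2 * quality + 0.15 * runtime_progress)
        return min(1.0, 0.05 + 0.7 * pass_rate + 0.25 * quality)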
graders/shared.py CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import ast
 import difflib
+import math
 import multiprocessing as mp
 import time
 import traceback
@@ -17,10 +18,71 @@ except ImportError:
     from tasks.catalog import CallCase, ReviewTask
 
 
+STRICT_SCORE_MIN = 0.01
+STRICT_SCORE_MAX = 0.99
+POOR_SCORE = 0.1
+NEAR_PERFECT_SCORE = 0.95
+
+
+def finite_float(value: Any, fallback: float = STRICT_SCORE_MIN) -> float:
+    """Convert a value into a finite float with a deterministic fallback."""
+
+    try:
+        numeric = float(value)
+    except (TypeError, ValueError):
+        return fallback
+    if math.isnan(numeric) or math.isinf(numeric):
+        return fallback
+    return numeric
+
+
 def clamp(value: float, lower: float = 0.0, upper: float = 1.0) -> float:
     """Clamp a floating-point value to a closed interval."""
 
-    return max(lower, min(upper, value))
+    numeric = finite_float(value, fallback=lower)
+    return max(lower, min(upper, numeric))
+
+
+def strict_score(value: Any, lower: float = STRICT_SCORE_MIN, upper: float = STRICT_SCORE_MAX) -> float:
+    """Clamp a score to the OpenEnv-safe open interval (0, 1)."""
+
+    score = max(lower, min(upper, finite_float(value, fallback=lower)))
+    score = round(score, 3)
+    assert 0 < score < 1, f"Invalid score: {score}"
+    return score
+
+
+def shaped_score(progress: Any, floor: float = POOR_SCORE, ceiling: float = NEAR_PERFECT_SCORE) -> float:
+    """Map progress in [0, 1] to a shaped score band within (0, 1)."""
+
+    bounded_progress = clamp(finite_float(progress, fallback=0.0))
+    score = floor + (ceiling - floor) * bounded_progress
+    score = max(STRICT_SCORE_MIN, min(score, STRICT_SCORE_MAX))
+    score = round(score, 3)
+    assert 0 < score < 1, f"Invalid score: {score}"
+    return score
+
+
+def score_from_checks(passed: int, total: int, floor: float = POOR_SCORE, ceiling: float = NEAR_PERFECT_SCORE) -> float:
+    """Convert discrete checks into a smoothly shaped score."""
+
+    return shaped_score(safe_ratio(passed, total), floor=floor, ceiling=ceiling)
+
+
+def safe_ratio(numerator: Any, denominator: Any) -> float:
+    """Return a stable ratio in [0, 1] that never raises or produces NaN."""
+
+    denom = int(finite_float(denominator, fallback=0.0))
+    if denom <= 0:
+        return 0.0
+    numer = finite_float(numerator, fallback=0.0)
+    return clamp(numer / denom)
+
+
+def component_score(value: Any) -> float:
+    """Normalize component scores such as syntax, quality, and runtime."""
+
+    return strict_score(value)
 
 
 def compile_code(code: str) -> tuple[bool, str]:
@@ -157,8 +219,8 @@ def quality_metrics(code: str, function_name: str) -> Dict[str, Any]:
     compiled, error = compile_code(code)
     if not compiled:
         return {
-            "score": 0.0,
-            "style_score": 0.0,
+            "score": component_score(STRICT_SCORE_MIN),
+            "style_score": component_score(STRICT_SCORE_MIN),
             "quality_notes": [error],
             "max_loop_depth": 99,
         }
@@ -238,8 +300,8 @@ def quality_metrics(code: str, function_name: str) -> Dict[str, Any]:
         score += 0.15
 
     return {
-        "score": round(clamp(score), 3),
-        "style_score": round(clamp(style_score), 3),
+        "score": component_score(clamp(score)),
+        "style_score": component_score(clamp(style_score)),
        "quality_notes": notes,
         "max_loop_depth": max_loop_depth,
     }
@@ -294,7 +356,7 @@ def benchmark_candidate(task: ReviewTask, code: str, timeout_s: float) -> Dict[str, Any]:
     """Benchmark a candidate solution against the starter implementation."""
 
     if not task.benchmark_config:
-        return {"runtime_score": 0.0, "details": "No benchmark configured."}
+        return {"runtime_score": component_score(STRICT_SCORE_MIN), "details": "No benchmark configured."}
 
     events = build_benchmark_events(task.benchmark_config)
     payload = {
@@ -306,15 +368,15 @@ def benchmark_candidate(task: ReviewTask, code: str, timeout_s: float) -> Dict[str, Any]:
     }
     result = run_with_timeout(_benchmark_worker, payload, timeout_s=timeout_s)
     if result.get("timed_out"):
-        return {"runtime_score": 0.0, "timed_out": True, "details": result["error"]}
+        return {"runtime_score": component_score(STRICT_SCORE_MIN), "timed_out": True, "details": result["error"]}
     if "error" in result:
-        return {"runtime_score": 0.0, "timed_out": False, "details": result["error"]}
+        return {"runtime_score": component_score(STRICT_SCORE_MIN), "timed_out": False, "details": result["error"]}
 
     data = result["data"]
     baseline_seconds = float(data["baseline_seconds"])
     candidate_seconds = float(data["candidate_seconds"])
     improvement_ratio = baseline_seconds / max(candidate_seconds, 1e-9)
-    runtime_score = round(clamp((improvement_ratio - 1.0) / 1.5), 3)
+    runtime_score = component_score(clamp((improvement_ratio - 1.0) / 1.5))
     return {
         "runtime_score": runtime_score,
         "timed_out": False,
@@ -352,13 +414,18 @@ def base_grade(
 ) -> TaskGrade:
     """Create a normalized TaskGrade payload."""
 
+    safe_score = strict_score(score)
+    safe_syntax_score = component_score(syntax_score)
+    safe_quality_score = component_score(quality_score)
+    safe_runtime_score = component_score(runtime_score)
+
     return TaskGrade(
-        score=round(clamp(score), 3),
-        syntax_score=round(clamp(syntax_score), 3),
+        score=safe_score,
+        syntax_score=safe_syntax_score,
         tests_passed=tests_passed,
         tests_total=tests_total,
-        quality_score=round(clamp(quality_score), 3),
-        runtime_score=round(clamp(runtime_score), 3),
+        quality_score=safe_quality_score,
+        runtime_score=safe_runtime_score,
         timed_out=timed_out,
         details=details,
     )
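A quick usage sketch of the helpers introduced above (expected values follow from the defaults STRICT_SCORE_MIN=0.01, STRICT_SCORE_MAX=0.99, POOR_SCORE=0.1, NEAR_PERFECT_SCORE=0.95; not part of the commit):

    from graders.shared import clamp, safe_ratio, shaped_score, strict_score

    strict_score(1.7)           # 0.99 - clipped to the top of the open interval
    strict_score(float("nan"))  # 0.01 - non-finite input falls back to the floor
    shaped_score(0.0)           # 0.1  - zero progress maps to POOR_SCORE
    shaped_score(1.0)           # 0.95 - full progress maps to NEAR_PERFECT_SCORE
    safe_ratio(3, 4)            # 0.75
    safe_ratio(5, 0)            # 0.0  - zero denominator never raises
    clamp(-2.5)                 # 0.0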
graders/syntax.py CHANGED
@@ -9,7 +9,16 @@ except ImportError:
     from models import TaskGrade
     from tasks.catalog import ReviewTask
 
-from .shared import base_grade, compile_code, execute_cases, quality_metrics, similarity_score, summarize_results
+from .shared import (
+    base_grade,
+    compile_code,
+    component_score,
+    execute_cases,
+    quality_metrics,
+    shaped_score,
+    similarity_score,
+    summarize_results,
+)
 
 
 def grade_syntax_task(task: ReviewTask, code: str, timeout_s: float = 2.0) -> TaskGrade:
@@ -24,16 +33,16 @@ def grade_syntax_task(task: ReviewTask, code: str, timeout_s: float = 2.0) -> TaskGrade:
     }
 
     if not compiled:
-        partial = round(min(0.7, similarity_score(code, task.reference_code) * 0.7), 3)
+        progress = 0.05 + 0.2 * similarity_score(code, task.reference_code)
         details["test_results"] = []
         details["test_summary"] = "Code does not compile yet."
         return base_grade(
-            score=partial,
-            syntax_score=0.0,
+            score=shaped_score(progress),
+            syntax_score=component_score(0.01),
            tests_passed=0,
             tests_total=len(task.public_cases) + len(task.hidden_cases),
-            quality_score=0.0,
-            runtime_score=0.0,
+            quality_score=component_score(0.01),
+            runtime_score=component_score(0.01),
             timed_out=False,
             details=details,
         )
@@ -43,26 +52,28 @@ def grade_syntax_task(task: ReviewTask, code: str, timeout_s: float = 2.0) -> TaskGrade:
     if result.get("timed_out"):
         details["test_results"] = []
         details["test_summary"] = result["error"]
+        progress = 0.2 + 0.25 * quality["score"]
         return base_grade(
-            score=0.8,
-            syntax_score=1.0,
+            score=shaped_score(progress),
+            syntax_score=component_score(0.95),
             tests_passed=0,
             tests_total=len(cases),
             quality_score=quality["score"],
-            runtime_score=0.0,
+            runtime_score=component_score(0.01),
             timed_out=True,
             details=details,
         )
     if "error" in result:
         details["test_results"] = []
         details["test_summary"] = result["error"]
+        progress = 0.18 + 0.2 * quality["score"]
         return base_grade(
-            score=1.0,
-            syntax_score=1.0,
+            score=shaped_score(progress),
+            syntax_score=component_score(0.95),
             tests_passed=0,
             tests_total=len(cases),
             quality_score=quality["score"],
-            runtime_score=0.0,
+            runtime_score=component_score(0.01),
             timed_out=False,
             details=details,
         )
@@ -70,13 +81,15 @@ def grade_syntax_task(task: ReviewTask, code: str, timeout_s: float = 2.0) -> TaskGrade:
     data = result["data"]
     details["test_results"] = data["results"]
     details["test_summary"] = summarize_results("Validation checks", data["results"])
+    pass_rate = data["passed"] / max(data["total"], 1)
+    progress = min(1.0, 0.15 + 0.75 * pass_rate + 0.1 * quality["score"])
     return base_grade(
-        score=1.0,
-        syntax_score=1.0,
+        score=shaped_score(progress),
+        syntax_score=component_score(0.95),
         tests_passed=data["passed"],
         tests_total=data["total"],
         quality_score=quality["score"],
-        runtime_score=0.0,
+        runtime_score=component_score(0.01),
         timed_out=False,
         details=details,
    )
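A small illustrative check (not part of the commit) of the new compile-failure band in grade_syntax_task; a similarity of 0.5 is an assumed input:

    progress = 0.05 + 0.2 * 0.5            # 0.15 for half-similar, non-compiling code
    # shaped_score(0.15) = 0.1 + 0.85 * 0.15 = 0.228 (rounded to 3 places)
    # The old branch returned round(min(0.7, 0.5 * 0.7), 3) = 0.35 with syntax_score pinned at 0.0.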
inference.py CHANGED
@@ -49,6 +49,9 @@ DEFAULT_MODEL_NAME = "mock-model"
49
  API_TIMEOUT_SECONDS = 3.0
50
  API_RETRIES = 1
51
  API_RETRY_DELAY_SECONDS = 0.2
 
 
 
52
 
53
 
54
  def safe_env(name: str, default: str = "") -> str:
@@ -61,14 +64,19 @@ def safe_env(name: str, default: str = "") -> str:
61
 
62
 
63
  def clamp_score(value: Any) -> float:
64
- """Clamp numeric scores to the required 0..1 interval."""
65
  try:
66
- return max(0.0, min(1.0, float(value)))
67
  except Exception:
68
- return 0.0
 
 
 
 
 
69
 
70
 
71
- def safe_float(value: Any, default: float = 0.0) -> float:
72
  """Convert a value to float without raising."""
73
  try:
74
  return float(value)
@@ -163,7 +171,7 @@ def build_prompt(observation: Any) -> str:
163
  task_description = safe_text(safe_getattr(observation, "task_description", ""), "No task description.")
164
  errors = safe_text(safe_getattr(observation, "errors", ""), "none")
165
  tests = safe_text(safe_getattr(observation, "test_results", ""), "not available")
166
- score = clamp_score(safe_getattr(observation, "score", 0.0))
167
  current_code = safe_code(safe_getattr(observation, "current_code", ""), "")
168
  visible_tests = safe_getattr(observation, "visible_tests", [])
169
  if not isinstance(visible_tests, Iterable) or isinstance(visible_tests, (str, bytes)):
@@ -262,10 +270,10 @@ def observation_reward(observation: Any) -> float:
262
  """Extract the scalar step reward from an observation."""
263
  reward = safe_getattr(observation, "reward", None)
264
  if reward is not None:
265
- return max(-1.0, min(1.0, safe_float(reward, 0.0)))
266
  reward_details = safe_getattr(observation, "reward_details", None)
267
- reward_value = safe_getattr(reward_details, "value", 0.0)
268
- return max(-1.0, min(1.0, safe_float(reward_value, 0.0)))
269
 
270
 
271
  def fallback_first_action(task_id: str) -> dict[str, Any]:
@@ -306,22 +314,22 @@ def run_task(task_id: str, client: Any | None, model: str) -> None:
306
  emit_start(task_id)
307
 
308
  if PythonCodeReviewEnvironment is None:
309
- emit_step(1, 0.0)
310
- emit_end(task_id, 0.0, 1)
311
  return
312
 
313
  try:
314
  with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
315
  env = PythonCodeReviewEnvironment(verbose=False)
316
  except Exception:
317
- emit_step(1, 0.0)
318
- emit_end(task_id, 0.0, 1)
319
  return
320
 
321
  observation = safe_reset(env, task_id)
322
  if observation is None:
323
- emit_step(1, 0.0)
324
- emit_end(task_id, 0.0, 1)
325
  return
326
 
327
  step_count = 0
@@ -347,14 +355,14 @@ def run_task(task_id: str, client: Any | None, model: str) -> None:
347
  next_observation = safe_step(env, make_action(action_payload))
348
  step_count += 1
349
  if next_observation is None:
350
- emit_step(step_count, 0.0)
351
- emit_end(task_id, clamp_score(safe_getattr(final_observation, "score", 0.0)), step_count)
352
  return
353
 
354
  final_observation = next_observation
355
  emit_step(step_count, observation_reward(final_observation))
356
 
357
- emit_end(task_id, clamp_score(safe_getattr(final_observation, "score", 0.0)), step_count)
358
 
359
 
360
  def main() -> int:
@@ -366,8 +374,8 @@ def main() -> int:
366
  run_task(task_id, client, model_name)
367
  except Exception:
368
  emit_start(task_id)
369
- emit_step(1, 0.0)
370
- emit_end(task_id, 0.0, 1)
371
  return 0
372
 
373
 
 
49
  API_TIMEOUT_SECONDS = 3.0
50
  API_RETRIES = 1
51
  API_RETRY_DELAY_SECONDS = 0.2
52
+ MIN_SCORE = 0.01
53
+ POOR_SCORE = 0.1
54
+ MAX_SCORE = 0.99
55
 
56
 
57
  def safe_env(name: str, default: str = "") -> str:
 
64
 
65
 
66
  def clamp_score(value: Any) -> float:
67
+ """Clamp numeric scores to the required open interval (0, 1)."""
68
  try:
69
+ numeric = float(value)
70
  except Exception:
71
+ return MIN_SCORE
72
+ if numeric != numeric or numeric in (float("inf"), float("-inf")):
73
+ return MIN_SCORE
74
+ numeric = max(MIN_SCORE, min(MAX_SCORE, numeric))
75
+ assert 0 < numeric < 1, f"Invalid score: {numeric}"
76
+ return numeric
77
 
78
 
79
+ def safe_float(value: Any, default: float = POOR_SCORE) -> float:
80
  """Convert a value to float without raising."""
81
  try:
82
  return float(value)
 
171
  task_description = safe_text(safe_getattr(observation, "task_description", ""), "No task description.")
172
  errors = safe_text(safe_getattr(observation, "errors", ""), "none")
173
  tests = safe_text(safe_getattr(observation, "test_results", ""), "not available")
174
+ score = clamp_score(safe_getattr(observation, "score", POOR_SCORE))
175
  current_code = safe_code(safe_getattr(observation, "current_code", ""), "")
176
  visible_tests = safe_getattr(observation, "visible_tests", [])
177
  if not isinstance(visible_tests, Iterable) or isinstance(visible_tests, (str, bytes)):
 
270
  """Extract the scalar step reward from an observation."""
271
  reward = safe_getattr(observation, "reward", None)
272
  if reward is not None:
273
+ return clamp_score(safe_float(reward, POOR_SCORE))
274
  reward_details = safe_getattr(observation, "reward_details", None)
275
+ reward_value = safe_getattr(reward_details, "value", POOR_SCORE)
276
+ return clamp_score(safe_float(reward_value, POOR_SCORE))
277
 
278
 
279
  def fallback_first_action(task_id: str) -> dict[str, Any]:
 
314
  emit_start(task_id)
315
 
316
  if PythonCodeReviewEnvironment is None:
317
+ emit_step(1, POOR_SCORE)
318
+ emit_end(task_id, POOR_SCORE, 1)
319
  return
320
 
321
  try:
322
  with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
323
  env = PythonCodeReviewEnvironment(verbose=False)
324
  except Exception:
325
+ emit_step(1, POOR_SCORE)
326
+ emit_end(task_id, POOR_SCORE, 1)
327
  return
328
 
329
  observation = safe_reset(env, task_id)
330
  if observation is None:
331
+ emit_step(1, POOR_SCORE)
332
+ emit_end(task_id, POOR_SCORE, 1)
333
  return
334
 
335
  step_count = 0
 
355
  next_observation = safe_step(env, make_action(action_payload))
356
  step_count += 1
357
  if next_observation is None:
358
+ emit_step(step_count, POOR_SCORE)
359
+ emit_end(task_id, clamp_score(safe_getattr(final_observation, "score", POOR_SCORE)), step_count)
360
  return
361
 
362
  final_observation = next_observation
363
  emit_step(step_count, observation_reward(final_observation))
364
 
365
+ emit_end(task_id, clamp_score(safe_getattr(final_observation, "score", POOR_SCORE)), step_count)
366
 
367
 
368
  def main() -> int:
 
374
  run_task(task_id, client, model_name)
375
  except Exception:
376
  emit_start(task_id)
377
+ emit_step(1, POOR_SCORE)
378
+ emit_end(task_id, POOR_SCORE, 1)
379
  return 0
380
 
381
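For reference (not part of the commit), the reworked clamp_score keeps every emitted value inside the open interval, including for malformed inputs; the values below assume MIN_SCORE = 0.01 and MAX_SCORE = 0.99 as defined above:

    clamp_score(0.0)            # 0.01 - exact zero is lifted to MIN_SCORE
    clamp_score(1.0)            # 0.99 - exact one is pulled down to MAX_SCORE
    clamp_score(float("nan"))   # 0.01 - NaN fails the self-equality check
    clamp_score("oops")         # 0.01 - float() raises and the except branch returns the floor
    clamp_score(0.42)           # 0.42 - in-range values pass through unchanged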
 
models.py CHANGED
@@ -20,13 +20,13 @@ class HistoryEntry(BaseModel):
     step: int = Field(..., ge=0)
     action_type: ActionType
     status: str = Field(..., description="Short outcome summary.")
-    reward: float = Field(..., description="Reward returned for the step.")
+    reward: float = Field(..., gt=0.0, lt=1.0, description="Reward returned for the step.")
 
 
 class RewardDetails(BaseModel):
     """Transparent reward decomposition for debugging and training."""
 
-    value: float = Field(..., description="Clamped net reward in [-1.0, 1.0].")
+    value: float = Field(..., gt=0.0, lt=1.0, description="Clamped net reward in (0.0, 1.0).")
     syntax_reward: float = Field(default=0.0)
     test_reward: float = Field(default=0.0)
     correctness_bonus: float = Field(default=0.0)
@@ -37,8 +37,8 @@ class RewardDetails(BaseModel):
     regression_penalty: float = Field(default=0.0)
     stagnation_penalty: float = Field(default=0.0)
     reason: str = Field(..., description="Human-readable reward explanation.")
-    prev_score: float = Field(default=0.0, ge=0.0, le=1.0)
-    curr_score: float = Field(default=0.0, ge=0.0, le=1.0)
+    prev_score: float = Field(default=0.01, gt=0.0, lt=1.0)
+    curr_score: float = Field(default=0.01, gt=0.0, lt=1.0)
     code_changed: bool = Field(default=False)
 
 
@@ -67,9 +67,9 @@ class PythonCodeReviewObservation(Observation):
     history: List[HistoryEntry] = Field(default_factory=list)
     attempts_remaining: int = Field(..., ge=0)
     last_action_status: str = Field(default="")
-    score: float = Field(..., ge=0.0, le=1.0)
+    score: float = Field(..., gt=0.0, lt=1.0)
     reward_details: RewardDetails = Field(
-        default_factory=lambda: RewardDetails(value=0.0, reason="Environment reset.")
+        default_factory=lambda: RewardDetails(value=0.1, reason="Environment reset.")
     )
 
 
@@ -84,7 +84,7 @@ class PythonCodeReviewState(State):
     errors: str = Field(default="")
     test_results: str = Field(default="")
     history: List[HistoryEntry] = Field(default_factory=list)
-    score: float = Field(default=0.0, ge=0.0, le=1.0)
+    score: float = Field(default=0.01, gt=0.0, lt=1.0)
     done: bool = Field(default=False)
 
 
@@ -117,12 +117,12 @@ class TaskSummary(BaseModel):
 class TaskGrade(BaseModel):
     """Deterministic grader output."""
 
-    score: float = Field(..., ge=0.0, le=1.0)
-    syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
+    score: float = Field(..., gt=0.0, lt=1.0)
+    syntax_score: float = Field(default=0.01, gt=0.0, lt=1.0)
     tests_passed: int = Field(default=0, ge=0)
     tests_total: int = Field(default=0, ge=0)
-    quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
-    runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
+    quality_score: float = Field(default=0.01, gt=0.0, lt=1.0)
+    runtime_score: float = Field(default=0.01, gt=0.0, lt=1.0)
     timed_out: bool = Field(default=False)
     details: Dict[str, Any] = Field(default_factory=dict)
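A brief sketch (not from the commit) of what the tightened field constraints do in practice, assuming Pydantic v2 validation semantics:

    from pydantic import ValidationError

    TaskGrade(score=0.5)    # accepted; the component scores default to 0.01
    try:
        TaskGrade(score=0.0)    # rejected: gt=0.0 excludes the boundary itself
    except ValidationError as exc:
        print(exc.errors()[0]["type"])   # "greater_than" under Pydantic v2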
 
Dockerfile → server/Dockerfile RENAMED
File without changes
server/env.py CHANGED
@@ -10,6 +10,7 @@ from openenv.core.env_server.types import EnvironmentMetadata
 
 try:
     from ..graders import grade_task
+    from ..graders.shared import component_score, safe_ratio, strict_score
     from ..models import (
         HistoryEntry,
         PythonCodeReviewAction,
@@ -21,6 +22,7 @@ try:
     from ..tasks import ReviewTask, list_tasks, select_task
 except ImportError:
     from graders import grade_task
+    from graders.shared import component_score, safe_ratio, strict_score
     from models import (
         HistoryEntry,
         PythonCodeReviewAction,
@@ -33,11 +35,18 @@ except ImportError:
 
 
 def _empty_grade() -> TaskGrade:
-    return TaskGrade(score=0.0, syntax_score=0.0, tests_passed=0, tests_total=0, quality_score=0.0, runtime_score=0.0)
+    return TaskGrade(
+        score=component_score(0.01),
+        syntax_score=component_score(0.01),
+        tests_passed=0,
+        tests_total=0,
+        quality_score=component_score(0.01),
+        runtime_score=component_score(0.01),
+    )
 
 
-def _clamp(value: float, lower: float = -1.0, upper: float = 1.0) -> float:
-    return max(lower, min(upper, value))
+def _reward_value(value: float) -> float:
+    return strict_score(value)
 
 
 class PythonCodeReviewEnvironment(
@@ -53,7 +62,7 @@ class PythonCodeReviewEnvironment(
         self._task: ReviewTask = list_tasks()[0]
         self._current_code: str = self._task.starter_code
         self._history: list[HistoryEntry] = []
-        self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
+        self._last_reward = RewardDetails(value=0.1, reason="Environment initialized.")
         self._current_grade = _empty_grade()
         self._state = PythonCodeReviewState(episode_id=str(uuid4()), step_count=0)
         self.reset()
@@ -68,7 +77,7 @@ class PythonCodeReviewEnvironment(
         self._task = select_task(seed=seed, task_id=task_id)
         self._current_code = self._task.starter_code
         self._history = []
-        self._last_reward = RewardDetails(value=0.0, reason="Environment reset.")
+        self._last_reward = RewardDetails(value=0.1, reason="Environment reset.")
         self._current_grade = grade_task(self._task, self._current_code, include_hidden=False)
 
         self._state = PythonCodeReviewState(
@@ -117,7 +126,10 @@ class PythonCodeReviewEnvironment(
         **kwargs: Any,
     ) -> Tuple[PythonCodeReviewObservation, float, bool, Dict[str, Any]]:
         if self._state.done:
-            reward = RewardDetails(value=0.0, reason="Episode already finished. Call reset() to continue.")
+            reward = RewardDetails(
+                value=_reward_value(0.05 + 0.25 * self._current_grade.score),
+                reason="Episode already finished. Call reset() to continue.",
+            )
             observation = self._build_observation(
                 grade=self._current_grade,
                 status="Episode already finished.",
@@ -266,22 +278,24 @@ class PythonCodeReviewEnvironment(
     ) -> RewardDetails:
         prev_score = previous_grade.score
         curr_score = current_grade.score
-        prev_rate = previous_grade.tests_passed / max(previous_grade.tests_total, 1)
-        curr_rate = current_grade.tests_passed / max(current_grade.tests_total, 1)
+        prev_rate = safe_ratio(previous_grade.tests_passed, previous_grade.tests_total)
+        curr_rate = safe_ratio(current_grade.tests_passed, current_grade.tests_total)
 
-        syntax_reward = 0.2 if previous_grade.syntax_score < 1.0 and current_grade.syntax_score >= 1.0 else 0.0
-        test_reward = round(max(curr_rate - prev_rate, 0.0) * 0.3, 3)
-        progress_delta = round(max(curr_score - prev_score, 0.0) * 0.4, 3)
-        quality_bonus = round(max(current_grade.quality_score - previous_grade.quality_score, 0.0) * 0.1, 3)
-        correctness_bonus = 0.5 if final_submission and curr_score >= 0.999 and prev_score < 0.999 else 0.0
+        syntax_reward = 0.14 if previous_grade.syntax_score < 0.9 and current_grade.syntax_score >= 0.9 else 0.0
+        test_reward = round(max(curr_rate - prev_rate, 0.0) * 0.22, 3)
+        progress_delta = round(max(curr_score - prev_score, 0.0) * 0.35, 3)
+        quality_bonus = round(max(current_grade.quality_score - previous_grade.quality_score, 0.0) * 0.08, 3)
+        correctness_bonus = 0.12 if final_submission and curr_score >= 0.94 and prev_score < 0.94 else 0.0
 
-        invalid_action_penalty = 0.1 if invalid_action else 0.0
-        timeout_penalty = 0.2 if timed_out else 0.0
+        invalid_action_penalty = 0.12 if invalid_action else 0.0
+        timeout_penalty = 0.14 if timed_out else 0.0
         regression_penalty = round(max(prev_score - curr_score, 0.0) * 0.2, 3)
-        stagnation_penalty = 0.05 if action.action_type == "edit_code" and not code_changed else 0.0
+        stagnation_penalty = 0.06 if action.action_type == "edit_code" and not code_changed else 0.0
 
-        value = _clamp(
-            syntax_reward
+        raw_value = (
+            0.1
+            + 0.45 * curr_score
+            + syntax_reward
             + test_reward
             + progress_delta
            + quality_bonus
@@ -291,6 +305,7 @@ class PythonCodeReviewEnvironment(
             - regression_penalty
             - stagnation_penalty
         )
+        value = _reward_value(raw_value)
 
         reason_parts = []
         if syntax_reward:
@@ -315,7 +330,7 @@ class PythonCodeReviewEnvironment(
             reason_parts.append("no meaningful state change")
 
         return RewardDetails(
-            value=round(value, 3),
+            value=value,
            syntax_reward=syntax_reward,
             test_reward=test_reward,
             correctness_bonus=correctness_bonus,
@@ -365,7 +380,7 @@ class PythonCodeReviewEnvironment(
 
     def _submission_status(self, grade: TaskGrade) -> str:
         runtime_text = ""
-        if grade.runtime_score:
+        if isinstance(grade.details.get("benchmark"), dict):
             runtime_text = f" runtime {grade.runtime_score:.2f};"
         return (
             f"Submission graded with score {grade.score:.2f}; "
tests/test_scoring.py ADDED
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from graders import grade_task
+from models import PythonCodeReviewAction
+from server.env import PythonCodeReviewEnvironment
+from tasks import list_tasks
+
+
+def assert_open_unit_interval(value: float) -> None:
+    assert 0 < value < 1, f"Invalid score: {value}"
+
+
+def test_task_grades_stay_strictly_between_zero_and_one() -> None:
+    for task in list_tasks():
+        starter_grade = grade_task(task, task.starter_code, include_hidden=False)
+        reference_grade = grade_task(task, task.reference_code, include_hidden=True)
+
+        for grade in (starter_grade, reference_grade):
+            assert_open_unit_interval(grade.score)
+            assert_open_unit_interval(grade.syntax_score)
+            assert_open_unit_interval(grade.quality_score)
+            assert_open_unit_interval(grade.runtime_score)
+
+
+def test_environment_scores_and_rewards_stay_in_open_interval() -> None:
+    env = PythonCodeReviewEnvironment(verbose=False)
+    observation = env.reset(task_id="bug_fix_session_windows")
+
+    assert_open_unit_interval(observation.score)
+    assert_open_unit_interval(observation.reward_details.value)
+
+    no_op_action = PythonCodeReviewAction(action_type="edit_code", code=observation.current_code)
+    next_observation, reward, _, _ = env.step_result(no_op_action)
+    assert_open_unit_interval(next_observation.score)
+    assert_open_unit_interval(reward)
+    assert_open_unit_interval(next_observation.reward_details.value)
+
+    submit_action = PythonCodeReviewAction(action_type="submit_solution", code=env._task.reference_code)
+    final_observation, final_reward, _, _ = env.step_result(submit_action)
+    assert_open_unit_interval(final_observation.score)
+    assert_open_unit_interval(final_reward)
+    assert_open_unit_interval(final_observation.reward_details.value)