srishtichugh committed on
Commit
28070b8
·
1 Parent(s): 53ae9f0

Fix rewards

Browse files
Files changed (2) hide show
  1. openenv.yaml +2 -2
  2. server/environment.py +8 -6
openenv.yaml CHANGED
@@ -46,9 +46,9 @@ api:
46
  docs: GET /docs
47
 
48
  reward:
49
- range: [-0.05, 1.2]
50
  partial: true
51
- terminal_bonus: 0.2
52
 
53
  observation_space:
54
  type: object
 
46
  docs: GET /docs
47
 
48
  reward:
49
+ range: [0.001, 0.999]
50
  partial: true
51
+ terminal_bonus: 0.0
52
 
53
  observation_space:
54
  type: object
server/environment.py CHANGED
@@ -33,7 +33,7 @@ class DataCleaningEnvironment:
33
  self._step_count: int = 0
34
  self._max_steps: int = 20
35
  self._total_errors: int = 0
36
- self._last_score: float = 0.0
37
  self._task_cycle: int = 0 # for round-robin default
38
 
39
  # ------------------------------------------------------------------
@@ -76,20 +76,22 @@ class DataCleaningEnvironment:
76
  score_after = self._compute_score()
77
  self._last_score = score_after
78
 
79
- delta = score_after - score_before
80
  if not applied:
81
- reward = -0.05
82
  elif delta <= 0:
83
- reward = -0.01
84
  else:
85
  reward = round(delta, 4)
86
 
87
  done = (score_after >= 0.95) or (self._step_count >= self._max_steps)
88
- if done and score_after >= 0.95:
89
- reward = round(reward + 0.2, 4)
 
90
 
91
  return self._build_obs(reward, done, message)
92
 
 
93
  def state(self) -> DataCleaningState:
94
  if self._df is None:
95
  return DataCleaningState(
 
33
  self._step_count: int = 0
34
  self._max_steps: int = 20
35
  self._total_errors: int = 0
36
+ self._last_score: float = 0.001
37
  self._task_cycle: int = 0 # for round-robin default
38
 
39
  # ------------------------------------------------------------------
 
76
  score_after = self._compute_score()
77
  self._last_score = score_after
78
 
79
+ delta = score_after - score_before
80
  if not applied:
81
+ reward = 0.001
82
  elif delta <= 0:
83
+ reward = 0.001
84
  else:
85
  reward = round(delta, 4)
86
 
87
  done = (score_after >= 0.95) or (self._step_count >= self._max_steps)
88
+
89
+ # Clamp reward to the inclusive range [0.001, 0.999] — no terminal bonus
90
+ reward = round(max(0.001, min(0.999, reward)), 4)
91
 
92
  return self._build_obs(reward, done, message)
93
 
94
+
95
  def state(self) -> DataCleaningState:
96
  if self._df is None:
97
  return DataCleaningState(