Spaces:
Sleeping
Sleeping
Commit ·
28070b8
1
Parent(s): 53ae9f0
Fix rewards
Browse files
- openenv.yaml +2 -2
- server/environment.py +8 -6
openenv.yaml
CHANGED
|
@@ -46,9 +46,9 @@ api:
|
|
| 46 |
docs: GET /docs
|
| 47 |
|
| 48 |
reward:
|
| 49 |
-
range: [
|
| 50 |
partial: true
|
| 51 |
-
terminal_bonus: 0.
|
| 52 |
|
| 53 |
observation_space:
|
| 54 |
type: object
|
|
|
|
| 46 |
docs: GET /docs
|
| 47 |
|
| 48 |
reward:
|
| 49 |
+
range: [0.001, 0.999]
|
| 50 |
partial: true
|
| 51 |
+
terminal_bonus: 0.0
|
| 52 |
|
| 53 |
observation_space:
|
| 54 |
type: object
|
server/environment.py
CHANGED
|
@@ -33,7 +33,7 @@ class DataCleaningEnvironment:
|
|
| 33 |
self._step_count: int = 0
|
| 34 |
self._max_steps: int = 20
|
| 35 |
self._total_errors: int = 0
|
| 36 |
-
self._last_score: float = 0.
|
| 37 |
self._task_cycle: int = 0 # for round-robin default
|
| 38 |
|
| 39 |
# ------------------------------------------------------------------
|
|
@@ -76,20 +76,22 @@ class DataCleaningEnvironment:
|
|
| 76 |
score_after = self._compute_score()
|
| 77 |
self._last_score = score_after
|
| 78 |
|
| 79 |
-
delta
|
| 80 |
if not applied:
|
| 81 |
-
reward =
|
| 82 |
elif delta <= 0:
|
| 83 |
-
reward =
|
| 84 |
else:
|
| 85 |
reward = round(delta, 4)
|
| 86 |
|
| 87 |
done = (score_after >= 0.95) or (self._step_count >= self._max_steps)
|
| 88 |
-
|
| 89 |
-
|
|
|
|
| 90 |
|
| 91 |
return self._build_obs(reward, done, message)
|
| 92 |
|
|
|
|
| 93 |
def state(self) -> DataCleaningState:
|
| 94 |
if self._df is None:
|
| 95 |
return DataCleaningState(
|
|
|
|
| 33 |
self._step_count: int = 0
|
| 34 |
self._max_steps: int = 20
|
| 35 |
self._total_errors: int = 0
|
| 36 |
+
self._last_score: float = 0.001
|
| 37 |
self._task_cycle: int = 0 # for round-robin default
|
| 38 |
|
| 39 |
# ------------------------------------------------------------------
|
|
|
|
| 76 |
score_after = self._compute_score()
|
| 77 |
self._last_score = score_after
|
| 78 |
|
| 79 |
+
delta = score_after - score_before
|
| 80 |
if not applied:
|
| 81 |
+
reward = 0.001
|
| 82 |
elif delta <= 0:
|
| 83 |
+
reward = 0.001
|
| 84 |
else:
|
| 85 |
reward = round(delta, 4)
|
| 86 |
|
| 87 |
done = (score_after >= 0.95) or (self._step_count >= self._max_steps)
|
| 88 |
+
|
| 89 |
+
# Clamp reward to the inclusive range [0.001, 0.999] — no terminal bonus
|
| 90 |
+
reward = round(max(0.001, min(0.999, reward)), 4)
|
| 91 |
|
| 92 |
return self._build_obs(reward, done, message)
|
| 93 |
|
| 94 |
+
|
| 95 |
def state(self) -> DataCleaningState:
|
| 96 |
if self._df is None:
|
| 97 |
return DataCleaningState(
|