Spaces:
Sleeping
Sleeping
import numpy as np
from app.models.toxicity_model import predict_toxicity
class ModerationEnv:
    """Sequential content-moderation environment for RL-style training.

    Iterates over a dataset of ``(text, true_label)`` pairs. At each step
    the agent picks an action (0=allow, 1=flag, 2=remove) and receives a
    reward reflecting how the action compares with the ground-truth label.
    """

    def __init__(self, data):
        """
        data: sequence of (text, true_label) pairs, where true_label is
              one of "allow", "flag", "remove".
        """
        self.data = data
        self.index = 0  # position of the next item to be judged

    def reset(self):
        """Restart the episode at the first item and return its state."""
        self.index = 0
        return self._get_state()

    def step(self, action):
        """Apply `action` to the current item and advance.

        Returns (next_state, reward, done); next_state is None once the
        dataset is exhausted.
        """
        text, true_label = self.data[self.index]
        reward = self.get_reward(action, true_label)
        self.index += 1
        done = self.index >= len(self.data)
        next_state = None if done else self._get_state()
        return next_state, reward, done

    def _get_state(self):
        """Convert the current text into a 4-dim state vector of AI scores.

        Components: toxicity, insult, threat, obscene (each defaults to
        0.0 when the model omits that key).
        """
        text, _ = self.data[self.index]
        ai_scores = predict_toxicity(text)
        return np.array([
            ai_scores.get("toxicity", 0.0),
            ai_scores.get("insult", 0.0),
            ai_scores.get("threat", 0.0),
            ai_scores.get("obscene", 0.0),
        ])

    def get_reward(self, action, true_label):
        """Score `action` against the ground truth.

        action: 0=allow, 1=flag, 2=remove (int), or the label string itself
        true_label: "allow", "flag", "remove"

        Returns an int reward: +3 perfect, +1 over-cautious flag,
        -4/-3 dangerous mistakes, -1 otherwise.
        """
        action_map = ["allow", "flag", "remove"]
        # BUG FIX: the original assigned `predicted = action`, comparing the
        # raw int against string labels — every branch failed and the reward
        # was always -1. Translate ints through action_map; accept a label
        # string directly for backward compatibility.
        predicted = action_map[action] if isinstance(action, int) else action
        # Perfect decision
        if predicted == true_label:
            return 3
        # Slight mistake: flagging is a safe, reviewable default
        if predicted == "flag" and true_label in ["allow", "remove"]:
            return 1
        # Dangerous mistake: let removable content through
        if predicted == "allow" and true_label == "remove":
            return -4
        # Dangerous mistake: removed content that was fine
        if predicted == "remove" and true_label == "allow":
            return -3
        return -1