SafeSpaceAI / src /env /moderation_env.py
Shreya Pal
Make API Key private
5c5b473
import numpy as np
from app.models.toxicity_model import predict_toxicity
class ModerationEnv:
    """Episodic environment that walks once through labelled moderation data.

    ``data`` is indexed by position and each item unpacks into a
    ``(text, true_label)`` pair. An episode starts with :meth:`reset` and
    advances one item per :meth:`step` call until the data is exhausted.
    """

    # Position in this tuple is the integer action code.
    _ACTIONS = ("allow", "flag", "remove")

    # The reward docstring historically listed "safe" as the benign label
    # while the comparisons used "allow"; treat both spellings as the same.
    _LABEL_ALIASES = {"safe": "allow"}

    def __init__(self, data):
        """Store the dataset and point at its first item."""
        self.data = data
        self.index = 0

    def reset(self):
        """Rewind to the first item and return its state vector."""
        self.index = 0
        return self._get_state()

    def step(self, action):
        """Score ``action`` against the current item and advance by one.

        Returns:
            ``(next_state, reward, done)``. ``next_state`` is ``None`` once
            the last item has been consumed.

        Raises:
            IndexError: if called again after the episode has finished
                (the current index is past the end of ``data``).
        """
        _, true_label = self.data[self.index]
        reward = self.get_reward(action, true_label)

        self.index += 1
        done = self.index >= len(self.data)
        next_state = None if done else self._get_state()
        return next_state, reward, done

    def _get_state(self):
        """Convert the current item's text into a state vector.

        The vector holds the "toxicity", "insult", "threat" and "obscene"
        scores, in that order; a score missing from the model output
        defaults to 0.0.
        """
        text, _ = self.data[self.index]
        # NOTE(review): assumes predict_toxicity returns a dict-like object
        # with a .get method - confirm against app.models.toxicity_model.
        ai_scores = predict_toxicity(text)
        return np.array([
            ai_scores.get("toxicity", 0.0),
            ai_scores.get("insult", 0.0),
            ai_scores.get("threat", 0.0),
            ai_scores.get("obscene", 0.0),
        ])

    def get_reward(self, action, true_label):
        """Return the reward for taking ``action`` on an item.

        Args:
            action: integer code (0=allow, 1=flag, 2=remove). An action
                name given as a string is also accepted and used as-is.
            true_label: "allow" (or its alias "safe"), "flag" or "remove".

        Returns:
            3 for a correct decision, 1 for flagging an item that should
            have been allowed or removed, -4 for allowing an item that
            should be removed, -3 for removing an item that should be
            allowed, and -1 for any other mismatch.

        Raises:
            ValueError: if an integer ``action`` is outside 0..2.
        """
        if isinstance(action, str):
            predicted = action
        else:
            code = int(action)
            # Reject out-of-range codes explicitly; a negative index would
            # otherwise silently wrap around to a valid action.
            if not 0 <= code < len(self._ACTIONS):
                raise ValueError(
                    f"action must be 0, 1 or 2, got {action!r}"
                )
            predicted = self._ACTIONS[code]

        if isinstance(true_label, str):
            true_label = self._LABEL_ALIASES.get(true_label, true_label)

        # Perfect decision.
        if predicted == true_label:
            return 3

        # Slight mistake: flagging is a cautious middle ground.
        if predicted == "flag" and true_label in ("allow", "remove"):
            return 1

        # Dangerous mistakes: harmful content let through, or benign
        # content taken down.
        if predicted == "allow" and true_label == "remove":
            return -4
        if predicted == "remove" and true_label == "allow":
            return -3

        return -1