Shreya Pal committed on
Commit
62c3394
·
1 Parent(s): a44e3ea

Replace algorithmic scoring with Groq contextual LLM moderation

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. server/app.py +49 -35
requirements.txt CHANGED
@@ -16,4 +16,5 @@ huggingface_hub==0.23.0
16
  openai
17
  openenv-core
18
  python-dotenv
19
- requests
 
 
16
  openai
17
  openenv-core
18
  python-dotenv
19
+ requests
20
+ groq
server/app.py CHANGED
@@ -108,40 +108,54 @@ async def state():
108
class ModerationRequest(BaseModel):
    """Request body for the POST /moderate endpoint."""
    text: str  # raw user-submitted content to moderate
110
 
111
def score_based_moderate(text: str, hf_scores: dict) -> dict:
    """Threshold-based moderation decision from RoBERTa toxicity scores.

    Maps the per-category scores in ``hf_scores`` (each defaulting to 0.0
    when absent) onto an allow / flag / remove decision via fixed cutoffs.
    Returns a dict with ``decision``, ``confidence`` and ``explanation``.
    """
    get = hf_scores.get
    tox = get("toxicity", 0.0)
    threat_s = get("threat", 0.0)
    insult_s = get("insult", 0.0)
    obscene_s = get("obscene", 0.0)
    severe_s = get("severe_toxicity", 0.0)
    identity_s = get("identity_attack", 0.0)

    peak = max(tox, threat_s, insult_s, obscene_s, severe_s, identity_s)

    # Severe toxicity, a credible threat, or targeted hate -> remove outright.
    if severe_s > 0.4 or threat_s > 0.7 or (tox > 0.85 and identity_s > 0.5):
        return {
            "decision": "remove",
            "confidence": round(min(0.95, peak + 0.1), 2),
            "explanation": "Content contains severe toxicity, a credible threat, or targeted hate speech.",
        }

    # Very high toxicity combined with very high insult -> remove.
    if tox > 0.85 and insult_s > 0.85:
        return {
            "decision": "remove",
            "confidence": round(peak, 2),
            "explanation": "Highly toxic and insulting content that violates community guidelines.",
        }

    # Moderately toxic or insulting -> flag for a human reviewer.
    if tox > 0.6 or insult_s > 0.7 or peak > 0.6:
        return {
            "decision": "flag",
            "confidence": round(peak, 2),
            "explanation": "Mildly toxic or insulting content. Flagged for human review.",
        }

    # Borderline overall score -> flag.
    if peak > 0.4:
        return {
            "decision": "flag",
            "confidence": round(peak, 2),
            "explanation": "Potentially offensive content detected. Flagged for review.",
        }

    # Low scores across the board -> safe to allow.
    return {
        "decision": "allow",
        "confidence": round(1.0 - peak, 2),
        "explanation": "Content appears safe with low toxicity scores.",
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  @app.post("/moderate")
147
  def moderate(request: ModerationRequest):
@@ -168,7 +182,7 @@ def moderate(request: ModerationRequest):
168
  except Exception as e:
169
  hf_scores = {}
170
 
171
- llm_result = score_based_moderate(text, hf_scores)
172
 
173
  ai_scores = {
174
  "toxicity": round(hf_scores.get("toxicity", 0.0), 3),
 
108
  class ModerationRequest(BaseModel):
109
  text: str
110
 
111
from groq import Groq

def groq_moderate(text: str, hf_scores: dict) -> dict:
    """Moderate *text* with a Groq-hosted LLM, using RoBERTa toxicity scores as context.

    Args:
        text: The user-submitted content to moderate.
        hf_scores: Per-category toxicity scores (0.0-1.0) from the HF model;
            may be empty when upstream scoring failed.

    Returns:
        dict with keys ``decision`` ("allow" | "flag" | "remove"),
        ``confidence`` (float clamped to [0.0, 1.0]) and ``explanation`` (str).

    Unlike the previous rule-based scorer this depends on a remote service,
    so any failure (missing GROQ_API_KEY, network error, or an LLM reply
    that is not valid JSON) falls back to a conservative "flag" decision
    instead of raising and turning /moderate into a 500.
    """
    relevant_keys = ["toxicity", "severe_toxicity", "insult", "threat", "obscene", "identity_attack"]
    filtered_scores = {k: round(hf_scores.get(k, 0.0), 3) for k in relevant_keys if k in hf_scores}

    try:
        client = Groq(api_key=os.getenv("GROQ_API_KEY"))
        response = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {"role": "system", "content": """You are an expert content moderation AI.

You will receive text and toxicity scores (0.0-1.0) from a RoBERTa model.

Make a decision based on FULL CONTEXT and INTENT — not just keywords. Consider:
- Sarcasm or dark humour that looks toxic but isn't harmful
- Context that changes meaning ("I'll destroy you at chess" is fine)
- Whether content genuinely targets a person harmfully
- Mild insults like "idiot" or "stupid" should be FLAG not REMOVE

Respond ONLY with valid JSON, no markdown:
{"decision": "allow" or "flag" or "remove", "confidence": <0.0-1.0>, "explanation": "<1 sentence>"}

allow = safe content
flag = mildly toxic, rude, or ambiguous
remove = genuine hate speech, real threats, severe harassment"""},
                {"role": "user", "content": f'Text: "{text}"\nScores: {json.dumps(filtered_scores)}\nModerate this.'}
            ],
            temperature=0.1,  # near-deterministic: we want stable moderation decisions
            max_tokens=100,
        )

        raw = response.choices[0].message.content.strip()

        # Strip an optional markdown code fence the model sometimes adds
        # despite the "no markdown" instruction.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
        if raw.startswith("json"):
            raw = raw[4:]
        raw = raw.strip()

        result = json.loads(raw)

        # Normalise and validate the LLM output before trusting it.
        decision = str(result.get("decision", "flag")).lower()
        if decision not in ("allow", "flag", "remove"):
            decision = "flag"
        confidence = min(max(float(result.get("confidence", 0.5)), 0.0), 1.0)
        explanation = result.get("explanation", "No explanation provided.")
        return {"decision": decision, "confidence": confidence, "explanation": explanation}
    except Exception:
        # Fail safe: API error, bad key, or unparseable reply — queue for humans
        # rather than crashing the endpoint or silently allowing the content.
        return {
            "decision": "flag",
            "confidence": 0.5,
            "explanation": "Moderation service unavailable or returned invalid output; flagged for human review.",
        }
159
 
160
  @app.post("/moderate")
161
  def moderate(request: ModerationRequest):
 
182
  except Exception as e:
183
  hf_scores = {}
184
 
185
+ llm_result = groq_moderate(text, hf_scores)
186
 
187
  ai_scores = {
188
  "toxicity": round(hf_scores.get("toxicity", 0.0), 3),