Spaces:
Sleeping
Sleeping
Shreya Pal committed on
Commit ·
62c3394
1
Parent(s): a44e3ea
Replace algorithmic scoring with Groq contextual LLM moderation
Browse files- requirements.txt +2 -1
- server/app.py +49 -35
requirements.txt
CHANGED
|
@@ -16,4 +16,5 @@ huggingface_hub==0.23.0
|
|
| 16 |
openai
|
| 17 |
openenv-core
|
| 18 |
python-dotenv
|
| 19 |
-
requests
|
|
|
|
|
|
| 16 |
openai
|
| 17 |
openenv-core
|
| 18 |
python-dotenv
|
| 19 |
+
requests
|
| 20 |
+
groq
|
server/app.py
CHANGED
|
@@ -108,40 +108,54 @@ async def state():
|
|
| 108 |
class ModerationRequest(BaseModel):
|
| 109 |
text: str
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
@app.post("/moderate")
|
| 147 |
def moderate(request: ModerationRequest):
|
|
@@ -168,7 +182,7 @@ def moderate(request: ModerationRequest):
|
|
| 168 |
except Exception as e:
|
| 169 |
hf_scores = {}
|
| 170 |
|
| 171 |
-
llm_result =
|
| 172 |
|
| 173 |
ai_scores = {
|
| 174 |
"toxicity": round(hf_scores.get("toxicity", 0.0), 3),
|
|
|
|
| 108 |
class ModerationRequest(BaseModel):
|
| 109 |
text: str
|
| 110 |
|
| 111 |
+
from groq import Groq

# Subset of the RoBERTa toxicity-model outputs that is worth forwarding to
# the LLM; other keys (if any) are noise for the moderation decision.
_RELEVANT_SCORE_KEYS = ["toxicity", "severe_toxicity", "insult", "threat", "obscene", "identity_attack"]

_SYSTEM_PROMPT = """You are an expert content moderation AI.

You will receive text and toxicity scores (0.0-1.0) from a RoBERTa model.

Make a decision based on FULL CONTEXT and INTENT — not just keywords. Consider:
- Sarcasm or dark humour that looks toxic but isn't harmful
- Context that changes meaning ("I'll destroy you at chess" is fine)
- Whether content genuinely targets a person harmfully
- Mild insults like "idiot" or "stupid" should be FLAG not REMOVE

Respond ONLY with valid JSON, no markdown:
{"decision": "allow" or "flag" or "remove", "confidence": <0.0-1.0>, "explanation": "<1 sentence>"}

allow = safe content
flag = mildly toxic, rude, or ambiguous
remove = genuine hate speech, real threats, severe harassment"""


def _strip_code_fence(raw: str) -> str:
    """Remove a leading markdown code fence (``` or ```json) from *raw*.

    Some models wrap JSON in fences despite instructions; returns the inner
    text, or *raw* unchanged when no fence is present.
    """
    if raw.startswith("```"):
        parts = raw.split("```")
        # parts[1] is the fenced body when the fence is terminated; fall back
        # to stripping just the opening fence when it is not.
        raw = parts[1] if len(parts) > 1 else raw[3:]
        if raw.startswith("json"):
            raw = raw[4:]
        raw = raw.strip()
    return raw


def _normalize_result(result: dict) -> dict:
    """Coerce a parsed LLM reply into the contract the caller expects.

    Guarantees: ``decision`` in {"allow", "flag", "remove"} (defaults to
    "flag"), ``confidence`` clamped to [0.0, 1.0], ``explanation`` present.
    """
    result["decision"] = str(result.get("decision", "flag")).lower()
    if result["decision"] not in ("allow", "flag", "remove"):
        result["decision"] = "flag"
    result["confidence"] = min(max(float(result.get("confidence", 0.5)), 0.0), 1.0)
    result["explanation"] = result.get("explanation", "No explanation provided.")
    return result


def groq_moderate(text: str, hf_scores: dict) -> dict:
    """Ask a Groq-hosted LLM for a contextual moderation decision on *text*.

    Args:
        text: the user content to moderate.
        hf_scores: per-category toxicity scores (0.0-1.0) from the RoBERTa
            model; only ``_RELEVANT_SCORE_KEYS`` are forwarded, rounded to 3
            decimals.

    Returns:
        dict with keys ``decision`` ("allow" | "flag" | "remove"),
        ``confidence`` (float in [0, 1]) and ``explanation`` (str).

    Never raises: any API, parsing, or validation failure degrades to a
    conservative ``flag`` decision so the /moderate endpoint stays up.
    """
    try:
        # NOTE(review): a client per call keeps the original behavior; Groq
        # clients are cheap, but this could be hoisted to module scope.
        client = Groq(api_key=os.getenv("GROQ_API_KEY"))

        filtered_scores = {
            k: round(hf_scores.get(k, 0.0), 3)
            for k in _RELEVANT_SCORE_KEYS
            if k in hf_scores
        }

        response = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {"role": "system", "content": _SYSTEM_PROMPT},
                {"role": "user", "content": f'Text: "{text}"\nScores: {json.dumps(filtered_scores)}\nModerate this.'}
            ],
            temperature=0.1,  # near-deterministic: we want a stable verdict
            max_tokens=100,   # the JSON verdict is short by construction
        )

        raw = _strip_code_fence(response.choices[0].message.content.strip())
        return _normalize_result(json.loads(raw))
    except Exception as e:
        # Fail safe rather than crash the endpoint: an unreachable API or a
        # malformed reply yields a low-confidence "flag" for human review.
        return {
            "decision": "flag",
            "confidence": 0.0,
            "explanation": f"LLM moderation unavailable: {e}",
        }
|
| 159 |
|
| 160 |
@app.post("/moderate")
|
| 161 |
def moderate(request: ModerationRequest):
|
|
|
|
| 182 |
except Exception as e:
|
| 183 |
hf_scores = {}
|
| 184 |
|
| 185 |
+
llm_result = groq_moderate(text, hf_scores)
|
| 186 |
|
| 187 |
ai_scores = {
|
| 188 |
"toxicity": round(hf_scores.get("toxicity", 0.0), 3),
|