Skip Lakera check for educational content to avoid false positives
Educational questions like "What causes people to hate those who
are different?" were being blocked by Lakera. Now:
- If the regex layer marks content as educational, skip the Lakera check (the assumed Layer 1 contract is sketched below)
- This prevents false positives on legitimate questions about
prejudice, discrimination, civil rights, etc.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
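
The guard reads a flag produced by the regex layer (Layer 1), which is not shown in this diff. For context, here is a minimal sketch of that assumed contract; the check_hate_speech_regex name and the patterns are hypothetical, invented for illustration, and not confirmed by this commit. The only part the diff actually depends on is that hate_result carries an "is_educational" boolean.

import re

# Hypothetical Layer 1 helper: the diff only shows that its result
# (hate_result) carries an "is_educational" flag. The helper name and
# the patterns below are illustrative, not taken from the repo.
EDUCATIONAL_PATTERNS = [
    r"\bwhat causes\b",
    r"\bwhy do (?:people|some people)\b",
    r"\bhistory of\b",
]

def check_hate_speech_regex(prompt: str) -> dict:
    """Return the regex-layer verdict consumed by the route handler."""
    is_educational = any(
        re.search(p, prompt, re.IGNORECASE) for p in EDUCATIONAL_PATTERNS
    )
    return {"is_blocked": False, "is_educational": is_educational}
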
- src/api/routes.py +12 -8
src/api/routes.py
CHANGED
@@ -90,14 +90,18 @@ async def query_llm(request: Request, query: QueryRequest, api_key: str = Depend
 
     # ========== LAYER 2: AI-based safety check (Lakera/Gemini) ==========
     # Only runs if regex layer passes
-    toxicity_result = detect_toxicity(query.prompt)
-    if toxicity_result["is_toxic"]:
-        categories = ", ".join(toxicity_result["blocked_categories"]) or "harmful content"
-        metrics.record_request(blocked=True)
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Security Alert: Content flagged by AI safety ({categories})"
-        )
+    # Skip for educational content (to avoid false positives on questions about hate/prejudice)
+    is_educational = hate_result.get("is_educational", False)
+
+    if not is_educational:
+        toxicity_result = detect_toxicity(query.prompt)
+        if toxicity_result["is_toxic"]:
+            categories = ", ".join(toxicity_result["blocked_categories"]) or "harmful content"
+            metrics.record_request(blocked=True)
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"Security Alert: Content flagged by AI safety ({categories})"
+            )
 
     # ========== LAYER 3: LLM Execution ==========
     response_content, provider_used, latency_ms, error_message, cascade_path = await llm_client.query_llm_cascade(
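
To sanity-check the new flow without standing up the API, the guard logic can be exercised in isolation. This sketch stubs detect_toxicity and mirrors the dict shapes shown in the diff; the stub's behavior and the should_block helper are invented for illustration.

# Stand-alone check of the guard logic from the diff. detect_toxicity is
# stubbed here; the dict shapes mirror what the diff shows.
def detect_toxicity(prompt: str) -> dict:
    # Pretend the AI layer flags anything containing "hate" as hate speech.
    toxic = "hate" in prompt.lower()
    return {"is_toxic": toxic, "blocked_categories": ["hate"] if toxic else []}

def should_block(hate_result: dict, prompt: str) -> bool:
    # Mirrors the new Layer 2 flow: educational content skips the AI check entirely.
    if hate_result.get("is_educational", False):
        return False
    return detect_toxicity(prompt)["is_toxic"]

# Before this commit the educational question would have been blocked;
# with the skip it now reaches Layer 3.
assert not should_block({"is_educational": True},
                        "What causes people to hate those who are different?")
assert should_block({"is_educational": False}, "I hate them all")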