vn6295337 Claude Opus 4.5 committed on
Commit
2f094f6
·
1 Parent(s): a55afbf

Skip Lakera check for educational content to avoid false positives

Browse files

Educational questions like "What causes people to hate those who
are different?" were being blocked by Lakera. Now:
- If regex layer marks content as educational, skip Lakera check
- This prevents false positives on legitimate questions about
prejudice, discrimination, civil rights, etc.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. src/api/routes.py +12 -8
src/api/routes.py CHANGED
@@ -90,14 +90,18 @@ async def query_llm(request: Request, query: QueryRequest, api_key: str = Depend
90
 
91
  # ========== LAYER 2: AI-based safety check (Lakera/Gemini) ==========
92
  # Only runs if regex layer passes
93
- toxicity_result = detect_toxicity(query.prompt)
94
- if toxicity_result["is_toxic"]:
95
- categories = ", ".join(toxicity_result["blocked_categories"]) or "harmful content"
96
- metrics.record_request(blocked=True)
97
- raise HTTPException(
98
- status_code=status.HTTP_400_BAD_REQUEST,
99
- detail=f"Security Alert: Content flagged by AI safety ({categories})"
100
- )
 
 
 
 
101
 
102
  # ========== LAYER 3: LLM Execution ==========
103
  response_content, provider_used, latency_ms, error_message, cascade_path = await llm_client.query_llm_cascade(
 
90
 
91
  # ========== LAYER 2: AI-based safety check (Lakera/Gemini) ==========
92
  # Only runs if regex layer passes
93
+ # Skip for educational content (to avoid false positives on questions about hate/prejudice)
94
+ is_educational = hate_result.get("is_educational", False)
95
+
96
+ if not is_educational:
97
+ toxicity_result = detect_toxicity(query.prompt)
98
+ if toxicity_result["is_toxic"]:
99
+ categories = ", ".join(toxicity_result["blocked_categories"]) or "harmful content"
100
+ metrics.record_request(blocked=True)
101
+ raise HTTPException(
102
+ status_code=status.HTTP_400_BAD_REQUEST,
103
+ detail=f"Security Alert: Content flagged by AI safety ({categories})"
104
+ )
105
 
106
  # ========== LAYER 3: LLM Execution ==========
107
  response_content, provider_used, latency_ms, error_message, cascade_path = await llm_client.query_llm_cascade(