| |
| """ |
| Task: thread-moderation-hard |
| Difficulty: Hard |
| Content types: whatsapp_message (full threads) |
| Steps per episode: 15 |
| Description: |
| This is the hardest task. Per step, the agent sees ONE message in a thread |
| PLUS the full prior thread history (context window grows). |
| Agent must: |
| 1. Multi-label classify the current message |
| 2. Account for cultural/regional context (author_region, language) |
| 3. Resolve conflicting policies (e.g. political speech vs hate speech) |
| 4. Decide action for the THREAD as a whole at the final message |
| Grader checks: |
| - Per-message label accuracy (partial credit) |
| - Thread-level action accuracy (full credit) |
| - Reasoning quality on conflicting policy cases (scored by keyword check) |
| - False positive rate on political speech protected content |
| """ |
| from __future__ import annotations |
| from models import ContentItem, ModerationObservation |
| from server.dataset import get_whatsapp_threads, get_community_standards |
|
|
# Identifier for this task; build_observation() copies it into every observation.
TASK_NAME = "thread-moderation-hard"
# Upper bound on the number of steps build_episode() returns for one episode.
MAX_STEPS = 15
# Prompt text attached to every observation. The adjacent string literals are
# concatenated by the parser, so each fragment carries its own trailing space.
INSTRUCTIONS = (
    "You are a senior content moderator handling a WhatsApp message thread. "
    "You see messages one at a time, with full prior context. "
    "For EACH message: assign all applicable violation labels, choose an action. "
    "For the FINAL message in a thread: also decide the thread-level action. "
    "IMPORTANT: Political speech is protected under CS_POLITICAL_001 even if it sounds harsh. "
    "Hate speech that uses political framing is NOT protected — use your judgment. "
    "When policies conflict, cite both rule IDs in policy_citations and explain in reasoning. "
    "Respond with a JSON ModerationDecision."
)
|
|
|
|
def build_episode(seed: int = 42) -> list[tuple[ContentItem, list[ContentItem], list[str]]]:
    """
    Build the ordered list of steps for one thread-moderation episode.

    The first three threads returned by ``get_whatsapp_threads(seed)`` are
    walked message by message; each message becomes one step. The combined
    list is truncated to ``MAX_STEPS`` entries.

    Args:
        seed: Passed through to ``get_whatsapp_threads`` to select the threads.

    Returns:
        A list of ``(current_message, thread_history_so_far,
        conflicting_policy_ids)`` tuples, one per step across all threads:

        * ``current_message`` is a ``ContentItem`` built from the message dict
          with its ``"position"`` key dropped.
        * ``thread_history_so_far`` is a snapshot of the messages that precede
          the current one within the same thread (empty for the first message).
        * ``conflicting_policy_ids`` is ``["CS_HATE_001", "CS_POLITICAL_001"]``
          when the thread's ``"difficulty"`` is ``"hard"``, otherwise ``[]``.
          Every step owns its own list, so mutating one step's list does not
          affect any other step.
    """
    conflict_policy_ids = ["CS_HATE_001", "CS_POLITICAL_001"]
    steps: list[tuple[ContentItem, list[ContentItem], list[str]]] = []

    for thread in get_whatsapp_threads(seed)[:3]:
        # Difficulty is a property of the thread, so decide once per thread.
        is_hard = thread.get("difficulty") == "hard"
        history: list[ContentItem] = []
        for msg in thread["messages"]:
            # "position" is stripped before construction -- presumably it is
            # not a ContentItem field (TODO confirm against models.ContentItem).
            item = ContentItem(**{k: v for k, v in msg.items() if k != "position"})
            # Copy both lists: the running history keeps growing, and the
            # conflict IDs must not be aliased between steps.
            conflicts = list(conflict_policy_ids) if is_hard else []
            steps.append((item, list(history), conflicts))
            history.append(item)

    return steps[:MAX_STEPS]
|
|
|
|
def build_observation(
    step: int,
    item: ContentItem,
    history: list[ContentItem],
    conflicting_policies: list[str],
) -> ModerationObservation:
    """
    Assemble the observation shown to the agent for a single step.

    The policy excerpt is rendered from ``get_community_standards()``: each
    entry under its ``"policies"`` key becomes one line of the form
    ``[id] name: description``, and the lines are joined with newlines.

    Args:
        step: Step index stored on the observation.
        item: The message currently being moderated.
        history: Earlier messages of the same thread, passed through as-is.
        conflicting_policies: Policy IDs flagged as conflicting for this step.

    Returns:
        A ``ModerationObservation`` carrying the arguments above together with
        the rendered policy excerpt, ``TASK_NAME`` and ``INSTRUCTIONS``.
    """
    standards = get_community_standards()

    rendered_lines = []
    for policy in standards["policies"]:
        rendered_lines.append(
            f"[{policy['id']}] {policy['name']}: {policy['description']}"
        )
    excerpt = "\n".join(rendered_lines)

    return ModerationObservation(
        step=step,
        content_item=item,
        policy_excerpt=excerpt,
        thread_history=history,
        conflicting_policies=conflicting_policies,
        task_name=TASK_NAME,
        instructions=INSTRUCTIONS,
    )
|
|