File size: 3,181 Bytes
# app/tasks/task_thread_hard.py
"""
Task: thread-moderation-hard
Difficulty: Hard
Content types: whatsapp_message (full threads)
Steps per episode: 15
Description:
This is the hardest task. Per step, the agent sees ONE message in a thread
PLUS the full prior thread history (context window grows).
Agent must:
1. Multi-label classify the current message
2. Account for cultural/regional context (author_region, language)
3. Resolve conflicting policies (e.g. political speech vs hate speech)
4. Decide action for the THREAD as a whole at the final message
Grader checks:
- Per-message label accuracy (partial credit)
- Thread-level action accuracy (full credit)
- Reasoning quality on conflicting policy cases (scored by keyword check)
- False positive rate on political speech protected content
"""
from __future__ import annotations
from models import ContentItem, ModerationObservation
from server.dataset import get_whatsapp_threads, get_community_standards
# Identifier of this task; attached to every observation as ``task_name``.
TASK_NAME: str = "thread-moderation-hard"

# Hard cap on the number of steps returned for a single episode.
MAX_STEPS: int = 15

# Prompt handed to the agent with every observation. Each fragment carries its
# own trailing space, so the pieces are joined with an empty separator.
INSTRUCTIONS: str = "".join(
    (
        "You are a senior content moderator handling a WhatsApp message thread. ",
        "You see messages one at a time, with full prior context. ",
        "For EACH message: assign all applicable violation labels, choose an action. ",
        "For the FINAL message in a thread: also decide the thread-level action. ",
        "IMPORTANT: Political speech is protected under CS_POLITICAL_001 even if it sounds harsh. ",
        "Hate speech that uses political framing is NOT protected — use your judgment. ",
        "When policies conflict, cite both rule IDs in policy_citations and explain in reasoning. ",
        "Respond with a JSON ModerationDecision.",
    )
)
def build_episode(seed: int = 42) -> list[tuple[ContentItem, list[ContentItem], list[str]]]:
    """Build the flattened, per-message step schedule for one episode.

    The first three threads returned by ``get_whatsapp_threads(seed)`` are
    walked in order. Each message becomes one step, paired with a snapshot of
    the messages that preceded it in the same thread.

    Args:
        seed: Forwarded unchanged to ``get_whatsapp_threads``.

    Returns:
        A list of ``(current_message, thread_history_so_far,
        conflicting_policy_ids)`` tuples, truncated to ``MAX_STEPS`` entries.
        ``conflicting_policy_ids`` is ``["CS_HATE_001", "CS_POLITICAL_001"]``
        when the thread's ``"difficulty"`` equals ``"hard"`` and ``[]``
        otherwise. Every step owns its own history list and its own conflict
        list, so mutating one step never leaks into another.

    Raises:
        KeyError: If a thread has no ``"messages"`` key.
    """
    # Three threads of roughly five messages each are expected to fill the
    # 15-step budget; the slice on return enforces the cap either way.
    threads = get_whatsapp_threads(seed)[:3]
    conflict_policy_ids = ["CS_HATE_001", "CS_POLITICAL_001"]

    steps = []
    for thread in threads:
        messages = thread["messages"]
        is_hard = thread.get("difficulty") == "hard"
        history = []
        for msg in messages:
            # "position" is stripped before construction — presumably it is not
            # a ContentItem field; confirm against models.ContentItem.
            item = ContentItem(**{k: v for k, v in msg.items() if k != "position"})
            # Copy per step (like the history snapshot) so that steps do not
            # all alias one shared policy-id list.
            conflicts = list(conflict_policy_ids) if is_hard else []
            steps.append((item, list(history), conflicts))
            history.append(item)
    return steps[:MAX_STEPS]
def build_observation(
    step: int,
    item: ContentItem,
    history: list[ContentItem],
    conflicting_policies: list[str],
) -> ModerationObservation:
    """Assemble the ``ModerationObservation`` shown to the agent for one step.

    Args:
        step: Step index, stored on the observation as ``step``.
        item: The message under review, stored as ``content_item``.
        history: Earlier messages of the thread, stored as ``thread_history``.
        conflicting_policies: Policy IDs stored as ``conflicting_policies``.

    Returns:
        An observation that also carries ``TASK_NAME``, ``INSTRUCTIONS`` and a
        newline-separated excerpt with one ``[id] name: description`` line per
        entry of the community standards' ``"policies"`` list.
    """
    standards = get_community_standards()

    # Render each policy on its own line, in the order the dataset lists them.
    rendered = []
    for p in standards["policies"]:
        rendered.append(f"[{p['id']}] {p['name']}: {p['description']}")
    excerpt = "\n".join(rendered)

    observation = ModerationObservation(
        step=step,
        content_item=item,
        policy_excerpt=excerpt,
        thread_history=history,
        conflicting_policies=conflicting_policies,
        task_name=TASK_NAME,
        instructions=INSTRUCTIONS,
    )
    return observation
|