File size: 3,181 Bytes
0ee3210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b50e57
0ee3210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# app/tasks/task_thread_hard.py
"""
Task: thread-moderation-hard
Difficulty: Hard
Content types: whatsapp_message (full threads)
Steps per episode: 15
Description:
    This is the hardest task. Per step, the agent sees ONE message in a thread
    PLUS the full prior thread history (context window grows).
    Agent must:
    1. Multi-label classify the current message
    2. Account for cultural/regional context (author_region, language)
    3. Resolve conflicting policies (e.g. political speech vs hate speech)
    4. Decide action for the THREAD as a whole at the final message
    Grader checks:
    - Per-message label accuracy (partial credit)
    - Thread-level action accuracy (full credit)
    - Reasoning quality on conflicting policy cases (scored by keyword check)
    - False positive rate on political speech protected content
"""
from __future__ import annotations
from models import ContentItem, ModerationObservation
from server.dataset import get_whatsapp_threads, get_community_standards

TASK_NAME = "thread-moderation-hard"
MAX_STEPS = 15
INSTRUCTIONS = (
    "You are a senior content moderator handling a WhatsApp message thread. "
    "You see messages one at a time, with full prior context. "
    "For EACH message: assign all applicable violation labels, choose an action. "
    "For the FINAL message in a thread: also decide the thread-level action. "
    "IMPORTANT: Political speech is protected under CS_POLITICAL_001 even if it sounds harsh. "
    "Hate speech that uses political framing is NOT protected — use your judgment. "
    "When policies conflict, cite both rule IDs in policy_citations and explain in reasoning. "
    "Respond with a JSON ModerationDecision."
)


def build_episode(seed: int = 42) -> list[tuple[ContentItem, list[ContentItem], list[str]]]:
    """
    Returns list of (current_message, thread_history_so_far, conflicting_policy_ids).
    One entry per step across all threads in the episode.
    """
    threads = get_whatsapp_threads(seed)[:3]  # 3 threads, ~5 messages each = 15 steps
    steps = []
    policies = get_community_standards()
    conflict_policy_ids = ["CS_HATE_001", "CS_POLITICAL_001"]

    for thread in threads:
        messages = thread["messages"]
        history = []
        for msg in messages:
            item = ContentItem(**{k: v for k, v in msg.items() if k != "position"})
            conflicts = conflict_policy_ids if thread.get("difficulty") == "hard" else []
            steps.append((item, list(history), conflicts))
            history.append(item)

    return steps[:MAX_STEPS]


def build_observation(
    step: int,
    item: ContentItem,
    history: list[ContentItem],
    conflicting_policies: list[str],
) -> ModerationObservation:
    policies = get_community_standards()
    policy_text = "\n".join(
        f"[{p['id']}] {p['name']}: {p['description']}"
        for p in policies["policies"]
    )
    return ModerationObservation(
        step=step,
        content_item=item,
        policy_excerpt=policy_text,
        thread_history=history,
        conflicting_policies=conflicting_policies,
        task_name=TASK_NAME,
        instructions=INSTRUCTIONS,
    )