File size: 1,996 Bytes
0ee3210 1b50e57 0ee3210 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | # app/tasks/task_multi_label.py
"""
Task: multi-label-classify
Difficulty: Medium
Content types: text_post, ad_copy, whatsapp_message
Steps per episode: 12
Description:
Agent receives content that may have 1, 2, or 3 violations simultaneously.
Must return ALL applicable labels (or [CLEAN]).
Grader uses precision/recall — partial credit for partial label matches.
Agent is penalized more for false positives than false negatives.
"""
from __future__ import annotations
from models import ContentItem, ModerationObservation
from server.dataset import get_posts, get_ad_copies, get_community_standards, get_policy_excerpt
# Identifier reported for this task in every observation.
TASK_NAME: str = "multi-label-classify"

# Upper bound on the number of content items in one episode.
MAX_STEPS: int = 12

# Prompt text handed to the agent with each observation. The fragments are
# joined with no separator, so each one carries its own trailing space.
INSTRUCTIONS: str = "".join([
    "You are an advanced content moderator. Each item may violate MULTIPLE policies simultaneously. ",
    "Assign ALL applicable violation categories. If the content is clean, return only [CLEAN]. ",
    "Provide your confidence score and briefly cite which policy rule applies. ",
    "Respond with a JSON ModerationDecision.",
])
def build_episode(seed: int = 42) -> list[ContentItem]:
    """Build the ordered list of content items for one episode.

    Posts from ``get_posts(seed)`` are kept when they carry more than one
    ground-truth label or are marked with difficulty ``"medium"`` or
    ``"hard"``. The first four entries of ``get_ad_copies(seed)`` are
    appended, and the combined list is cut to ``MAX_STEPS``. The keys listed
    in ``withheld`` are dropped from each record before it is passed to
    ``ContentItem``.

    Args:
        seed: Forwarded unchanged to both dataset loaders.

    Returns:
        At most ``MAX_STEPS`` ``ContentItem`` objects, posts first.
    """
    # Keys that are never forwarded to the ContentItem constructor.
    withheld = frozenset((
        "ground_truth_labels",
        "ground_truth_action",
        "difficulty",
        "ad_targeting",
        "violated_policies",
    ))

    chosen_posts = []
    for post in get_posts(seed):
        has_multiple_labels = len(post.get("ground_truth_labels", [])) > 1
        if has_multiple_labels or post.get("difficulty") in ("medium", "hard"):
            chosen_posts.append(post)

    ad_sample = get_ad_copies(seed)[:4]

    # NOTE(review): posts come first, so when the filtered posts already
    # number MAX_STEPS or more, no ad makes it into the episode -- confirm
    # this is intended.
    candidates = chosen_posts + ad_sample
    episode_records = candidates[:MAX_STEPS]

    episode = []
    for record in episode_records:
        visible_fields = {
            key: value for key, value in record.items() if key not in withheld
        }
        episode.append(ContentItem(**visible_fields))
    return episode
def build_observation(step: int, item: ContentItem) -> ModerationObservation:
    """Wrap a content item in the observation shown to the agent.

    Args:
        step: Step value stored on the observation as-is.
        item: Content item to present; its ``content_type.value`` is used to
            look up the policy excerpt.

    Returns:
        A ``ModerationObservation`` carrying the item, the policy excerpt
        returned by ``get_policy_excerpt``, and this task's name and
        instructions.
    """
    standards = get_community_standards()
    content_kind = item.content_type.value
    excerpt = get_policy_excerpt(content_kind, standards)
    return ModerationObservation(
        step=step,
        content_item=item,
        policy_excerpt=excerpt,
        task_name=TASK_NAME,
        instructions=INSTRUCTIONS,
    )
|