| |
| """ |
| Task: multi-label-classify |
| Difficulty: Medium |
| Content types: text_post, ad_copy, whatsapp_message |
| Steps per episode: 12 |
| Description: |
| Agent receives content that may have 1, 2, or 3 violations simultaneously. |
| Must return ALL applicable labels (or [CLEAN]). |
| Grader uses precision/recall — partial credit for partial label matches. |
| Agent is penalized more for false positives than false negatives. |
| """ |
| from __future__ import annotations |
| from models import ContentItem, ModerationObservation |
| from server.dataset import get_posts, get_ad_copies, get_community_standards, get_policy_excerpt |
|
|
# Identifier reported in every observation produced for this task.
TASK_NAME = "multi-label-classify"

# Upper bound on the number of content items served in one episode.
MAX_STEPS = 12

# Prompt handed to the agent with each observation; one sentence per literal,
# joined by implicit string concatenation into a single paragraph.
INSTRUCTIONS = (
    "You are an advanced content moderator. "
    "Each item may violate MULTIPLE policies simultaneously. "
    "Assign ALL applicable violation categories. "
    "If the content is clean, return only [CLEAN]. "
    "Provide your confidence score and briefly cite which policy rule applies. "
    "Respond with a JSON ModerationDecision."
)
|
|
|
|
def build_episode(seed: int = 42) -> list[ContentItem]:
    """Assemble the content items for one multi-label / borderline episode.

    Posts returned by ``get_posts(seed)`` are kept when they carry more than
    one ground-truth label or when their ``difficulty`` is ``"medium"`` or
    ``"hard"``. The first four entries of ``get_ad_copies(seed)`` are appended
    after the kept posts, and the combined list is cut to ``MAX_STEPS`` items.

    Args:
        seed: Value forwarded unchanged to ``get_posts`` and ``get_ad_copies``.

    Returns:
        A list of at most ``MAX_STEPS`` ``ContentItem`` objects, each built
        from a raw record with the hidden keys removed.
    """
    # Keys dropped from each raw record before it is passed to ContentItem
    # (presumably so the answer key is not exposed to the agent - confirm).
    hidden_keys = (
        "ground_truth_labels",
        "ground_truth_action",
        "difficulty",
        "ad_targeting",
        "violated_policies",
    )

    selected_posts = []
    for post in get_posts(seed):
        has_multiple_labels = len(post.get("ground_truth_labels", [])) > 1
        if has_multiple_labels or post.get("difficulty") in ("medium", "hard"):
            selected_posts.append(post)

    ad_samples = get_ad_copies(seed)[:4]

    # NOTE(review): the cap is applied after concatenation, so ads only make
    # it into the episode when fewer than MAX_STEPS posts pass the filter -
    # confirm this is intended.
    candidates = (selected_posts + ad_samples)[:MAX_STEPS]

    episode = []
    for raw in candidates:
        visible_fields = {
            key: value for key, value in raw.items() if key not in hidden_keys
        }
        episode.append(ContentItem(**visible_fields))
    return episode
|
|
|
|
def build_observation(step: int, item: ContentItem) -> ModerationObservation:
    """Wrap a content item in the observation handed to the agent.

    Args:
        step: Step value stored on the observation as-is.
        item: The content item to be moderated; its ``content_type.value`` is
            used to look up the policy excerpt.

    Returns:
        A ``ModerationObservation`` carrying the step, the item, the excerpt
        returned by ``get_policy_excerpt``, and this task's name and
        instructions.
    """
    standards = get_community_standards()
    content_type = item.content_type.value
    excerpt = get_policy_excerpt(content_type, standards)

    return ModerationObservation(
        step=step,
        content_item=item,
        policy_excerpt=excerpt,
        task_name=TASK_NAME,
        instructions=INSTRUCTIONS,
    )
|
|