# openenv.yaml
#
# Manifest for the content-moderation environment: package metadata, the
# environment class and its grader entry points, the task catalogue, the
# observation/action schemas, the reward description, and the HTTP API/server
# settings.
#
# NOTE(review): this file was reconstructed from a copy whose line breaks and
# indentation had been collapsed. Key names and scalar values are unchanged;
# the nesting below is the most natural reading — confirm the spots marked
# NOTE(review) against the consumer's schema.
---
name: meta-content-moderation-env
version: "0.1.0"
description: >
  OpenEnv environment simulating real-world AI content moderation at Meta
  scale. Agents must classify, label, and take enforcement actions on text
  posts, image descriptions, ad copy, and WhatsApp-style messages across 4
  tasks ranging from easy single-label classification to hard multi-policy
  thread moderation.

tags:
  - openenv
  - content-moderation
  - nlp
  - meta
  - social-media
  - safety

author: "Team Neuron"
license: MIT

# Environment implementation, given as class name + module path.
environment:
  class: MetaContentModerationEnv
  module: server.env
  # NOTE(review): entry_point is assumed to belong to `environment`; it may
  # instead be a top-level key — confirm.
  entry_point: "server.app:app"

# Task id -> grader entry point in "module:attribute" form.
# NOTE(review): assumed to be a top-level key (sibling of `tasks`) — confirm.
graders:
  single-label-classify: "server.graders:single_label_entry"
  multi-label-classify: "server.graders:multi_label_entry"
  ad-policy-compliance: "server.graders:ad_policy_entry"
  thread-moderation-hard: "server.graders:thread_hard_entry"

# Task catalogue. Each task repeats its grader in both "module:attribute"
# (`grader`) and dotted (`grader_import`) form.
tasks:
  - id: single-label-classify
    name: single-label-classify
    difficulty: easy
    description: Classify a single content item into one violation category or CLEAN
    grader: "server.graders:single_label_entry"
    grader_import: "server.graders.single_label_entry"

  - id: multi-label-classify
    name: multi-label-classify
    difficulty: medium
    description: Assign all applicable violation labels to content that may violate multiple policies
    grader: "server.graders:multi_label_entry"
    grader_import: "server.graders.multi_label_entry"

  - id: ad-policy-compliance
    name: ad-policy-compliance
    difficulty: medium_hard
    description: Review ad copy against ad policies, identify violations, and cite specific rule IDs
    grader: "server.graders:ad_policy_entry"
    grader_import: "server.graders.ad_policy_entry"

  - id: thread-moderation-hard
    name: thread-moderation-hard
    difficulty: hard
    description: >
      Moderate a full WhatsApp conversation thread with growing context
      window. Handles cultural nuance, multi-label violations, and
      conflicting policy resolution.
    grader: "server.graders:thread_hard_entry"
    grader_import: "server.graders.thread_hard_entry"

# Observation schema. Type specs are descriptive strings, quoted so the
# bracketed parts are never mistaken for YAML flow sequences.
observation_space:
  type: object
  fields:
    step: integer
    # NOTE(review): content_item is assumed to end at media_types, with the
    # remaining fields at observation level — confirm.
    content_item:
      content_id: string
      content_type: "enum [text_post, image_description, ad_copy, whatsapp_message]"
      text: string
      author_region: string
      language: string
      author_history: "list[string]"
      media_urls: "list[string]"
      media_types: "list[string]"
    policy_excerpt: string
    thread_history: "list[ContentItem]"
    conflicting_policies: "list[string]"
    task_name: string
    instructions: string

# Action schema, using the same descriptive type-spec strings.
action_space:
  type: object
  fields:
    content_id: string
    # Folded with >- only to stay within line length; the value is the same
    # single-line string.
    labels: >-
      list[enum[hate_speech, violence, nudity, spam, misinformation,
      harassment, self_harm, terrorism, scam, clean]]
    action: "enum [approve, remove, escalate, restrict, request_review]"
    confidence: "float [0.0, 1.0]"
    reasoning: string
    policy_citations: "list[string]"

reward:
  # Two-element numeric sequence (not a string).
  range: [-1.0, 1.0]
  shape: partial_credit
  components:
    - label_precision
    - label_recall
    - action_correct
    - policy_citation_score
    - false_positive_penalty
    - reasoning_quality

# HTTP endpoints, written as "<METHOD> <path>".
api:
  reset: POST /reset
  step: POST /step
  state: GET /state
  health: GET /health

server:
  host: "0.0.0.0"
  port: 7860
  framework: fastapi