---
# Manifest metadata for the content-moderation environment.
name: meta-content-moderation-env
# Quoted so the version is always read as a string.
version: "0.1.0"
# Folded scalar: the indented lines are joined with spaces into one paragraph.
description: >
  OpenEnv environment simulating real-world AI content moderation at Meta scale.
  Agents must classify, label, and take enforcement actions on text posts,
  image descriptions, ad copy, and WhatsApp-style messages across 4 tasks
  ranging from easy single-label classification to hard multi-policy thread moderation.

# Free-form discovery tags.
tags:
  - openenv
  - content-moderation
  - nlp
  - meta
  - social-media
  - safety

author: "Team Neuron"
license: MIT

# Location of the environment implementation.
# NOTE(review): `module` + `class` look like a Python import path and class
# name, and `entry_point` like a "module:attribute" application reference;
# confirm against whatever loads this manifest.
environment:
  class: MetaContentModerationEnv
  module: server.env
  # Quoted because the value contains a colon.
  entry_point: "server.app:app"

# Task id -> grader reference in "module:attribute" form.
# The keys are the same identifiers used as `id` in the `tasks` list.
graders:
  single-label-classify: "server.graders:single_label_entry"
  multi-label-classify: "server.graders:multi_label_entry"
  ad-policy-compliance: "server.graders:ad_policy_entry"
  thread-moderation-hard: "server.graders:thread_hard_entry"

# Task catalogue, listed from `easy` to `hard`.
# Every entry names its grader twice: `grader` in "module:attribute" form and
# `grader_import` as a fully dotted path.
# NOTE(review): which of the two spellings the consumer reads is not visible
# here; keep them pointing at the same callable.
tasks:
  - id: single-label-classify
    name: single-label-classify
    difficulty: easy
    description: Classify a single content item into one violation category or CLEAN
    grader: "server.graders:single_label_entry"
    grader_import: "server.graders.single_label_entry"

  - id: multi-label-classify
    name: multi-label-classify
    difficulty: medium
    description: Assign all applicable violation labels to content that may violate multiple policies
    grader: "server.graders:multi_label_entry"
    grader_import: "server.graders.multi_label_entry"

  - id: ad-policy-compliance
    name: ad-policy-compliance
    difficulty: medium_hard
    description: Review ad copy against ad policies, identify violations, and cite specific rule IDs
    grader: "server.graders:ad_policy_entry"
    grader_import: "server.graders.ad_policy_entry"

  - id: thread-moderation-hard
    name: thread-moderation-hard
    difficulty: hard
    # Folded scalar: the two lines below become a single paragraph.
    description: >
      Moderate a full WhatsApp conversation thread with growing context window.
      Handles cultural nuance, multi-label violations, and conflicting policy resolution.
    grader: "server.graders:thread_hard_entry"
    grader_import: "server.graders.thread_hard_entry"

# Informal schema of the observation object. Values are type descriptors
# (plain strings), not YAML types.
# NOTE(review): the nesting was reconstructed from a flattened copy of this
# file. The per-item attributes (`content_id` .. `media_types`) are placed
# under `content_item`, and the remaining fields directly under `fields`;
# confirm against the environment's observation model.
observation_space:
  type: object
  fields:
    step: integer
    content_item:
      content_id: string
      # Descriptors containing brackets/commas are quoted so they can never be
      # mistaken for YAML flow collections.
      content_type: "enum [text_post, image_description, ad_copy, whatsapp_message]"
      text: string
      author_region: string
      language: string
      author_history: "list[string]"
      media_urls: "list[string]"
      media_types: "list[string]"
    policy_excerpt: string
    thread_history: "list[ContentItem]"
    conflicting_policies: "list[string]"
    task_name: string
    instructions: string

# Informal schema of the action object an agent submits. Values are type
# descriptors (plain strings), not YAML types.
action_space:
  type: object
  fields:
    content_id: string
    # Descriptors containing brackets/commas are quoted so they can never be
    # mistaken for YAML flow collections.
    labels: "list[enum[hate_speech, violence, nudity, spam, misinformation, harassment, self_harm, terrorism, scam, clean]]"
    action: "enum [approve, remove, escalate, restrict, request_review]"
    confidence: "float [0.0, 1.0]"
    reasoning: string
    policy_citations: "list[string]"

# Reward metadata: declared value range, shaping style, and the named score
# components.
# NOTE(review): how the components are weighted and combined is not specified
# in this file.
reward:
  range: [-1.0, 1.0]
  shape: partial_credit
  components:
    - label_precision
    - label_recall
    - action_correct
    - policy_citation_score
    - false_positive_penalty
    - reasoning_quality

# HTTP routes, one per operation, written as "<METHOD> <path>".
api:
  reset: POST /reset
  step: POST /step
  state: GET /state
  health: GET /health

# Server settings.
server:
  # 0.0.0.0 is the IPv4 wildcard address (listen on all interfaces); quoted so
  # it is always read as a string.
  host: "0.0.0.0"
  port: 7860
  framework: fastapi