# Pratap-K's picture
# minor fix
# 610fff2
# openenv.yaml
# Identity of the environment package.
name: meta-content-moderation-env
# Quoted so the version is always read as a string.
version: "0.1.0"
# Folded scalar (`>`): the indented lines below are joined with single spaces
# into one paragraph. The body must be indented deeper than the key.
description: >
  OpenEnv environment simulating real-world AI content moderation at Meta
  scale. Agents must classify, label, and take enforcement actions on text
  posts, image descriptions, ad copy, and WhatsApp-style messages across 4
  tasks ranging from easy single-label classification to hard multi-policy
  thread moderation.
# Keyword tags attached to this environment.
tags:
  - openenv
  - content-moderation
  - nlp
  - meta
  - social-media
  - safety
# Authorship and licensing.
author: 'Team Neuron'
license: MIT
# Where the environment implementation lives.
# NOTE(review): the reviewed copy had no indentation; `class`, `module` and
# `entry_point` are assumed to be children of `environment` — confirm against
# whatever loads this manifest.
environment:
  class: MetaContentModerationEnv
  module: server.env
  # Quoted because the value contains a colon.
  entry_point: "server.app:app"
# Task id -> grader reference. The values repeat the per-task `grader` entries
# listed under `tasks`.
graders:
  single-label-classify: "server.graders:single_label_entry"
  multi-label-classify: "server.graders:multi_label_entry"
  ad-policy-compliance: "server.graders:ad_policy_entry"
  thread-moderation-hard: "server.graders:thread_hard_entry"
# Task catalogue: one list entry per task.
# `grader` is written as "module:attribute"; `grader_import` appears to spell
# the same target as a dotted path.
# NOTE(review): nesting was reconstructed from a copy without indentation —
# confirm each entry's keys against the consumer's schema.
tasks:
  - id: single-label-classify
    name: single-label-classify
    difficulty: easy
    description: Classify a single content item into one violation category or CLEAN
    grader: "server.graders:single_label_entry"
    grader_import: "server.graders.single_label_entry"

  - id: multi-label-classify
    name: multi-label-classify
    difficulty: medium
    description: Assign all applicable violation labels to content that may violate multiple policies
    grader: "server.graders:multi_label_entry"
    grader_import: "server.graders.multi_label_entry"

  - id: ad-policy-compliance
    name: ad-policy-compliance
    difficulty: medium_hard
    description: Review ad copy against ad policies, identify violations, and cite specific rule IDs
    grader: "server.graders:ad_policy_entry"
    grader_import: "server.graders.ad_policy_entry"

  - id: thread-moderation-hard
    name: thread-moderation-hard
    difficulty: hard
    # Folded scalar: the two lines below become one space-joined paragraph.
    description: >
      Moderate a full WhatsApp conversation thread with growing context window.
      Handles cultural nuance, multi-label violations, and conflicting policy resolution.
    grader: "server.graders:thread_hard_entry"
    grader_import: "server.graders.thread_hard_entry"
# Observation schema. Values such as `list[string]` or `enum [...]` are plain
# YAML strings, not YAML lists.
# NOTE(review): indentation was missing in the reviewed copy. The keys from
# `content_id` through `media_types` are assumed to describe `content_item`,
# and the keys from `policy_excerpt` onward are assumed to be observation-level
# fields — confirm against the environment's observation model.
observation_space:
  type: object
  fields:
    step: integer
    content_item:
      content_id: string
      content_type: enum [text_post, image_description, ad_copy, whatsapp_message]
      text: string
      author_region: string
      language: string
      author_history: list[string]
      media_urls: list[string]
      media_types: list[string]
    policy_excerpt: string
    thread_history: list[ContentItem]
    conflicting_policies: list[string]
    task_name: string
    instructions: string
# Action schema. Type descriptions (`enum [...]`, `float [0.0, 1.0]`, ...) are
# plain YAML strings, not YAML lists.
# NOTE(review): nesting under `fields` was reconstructed from a copy without
# indentation — confirm against the environment's action model.
action_space:
  type: object
  fields:
    content_id: string
    labels: list[enum[hate_speech, violence, nudity, spam, misinformation, harassment, self_harm, terrorism, scam, clean]]
    action: enum [approve, remove, escalate, restrict, request_review]
    confidence: float [0.0, 1.0]
    reasoning: string
    policy_citations: list[string]
# Reward description: numeric range, shaping label, and the named components.
reward:
  # Flow sequence of two floats: lower and upper bound.
  range: [-1.0, 1.0]
  shape: partial_credit
  components:
    - label_precision
    - label_recall
    - action_correct
    - policy_citation_score
    - false_positive_penalty
    - reasoning_quality
# HTTP endpoints, written as "<METHOD> <path>".
api:
  reset: POST /reset
  step: POST /step
  state: GET /state
  health: GET /health
# Server settings: bind address, port, and web framework name.
server:
  host: "0.0.0.0"
  port: 7860
  framework: fastapi