# content-moderation-queue / openenv.yaml
# sniki28's picture
# Upload openenv.yaml with huggingface_hub
# f91dadc verified
# Environment identity and discovery metadata.
name: content-moderation-queue
# Quoted so the version stays a string rather than being type-inferred.
version: "1.0.0"
# Folded scalar: the lines below are joined with spaces into one paragraph.
description: >
  A real-world content moderation environment where AI agents learn to triage
  social media posts using a tiered policy framework. Agents moderate a queue
  of posts — classifying violations, applying graduated consequences, handling
  appeals, and escalating crisis content. Simulates the daily work of a Trust
  & Safety moderation team.
author: content-moderation-env
# Free-form labels describing the environment.
tags:
  - openenv
  - content-moderation
  - trust-and-safety
  - nlp
  - real-world
  - policy-enforcement
# Schema of the observation: the post under review plus queue/episode state.
observation_space:
  type: object
  description: >
    Current post to moderate plus queue/context state.
  properties:
    # --- The post being moderated ---
    post_id: { type: string }
    content: { type: string, description: "Full text of the post" }
    author_id: { type: string }
    author_history:
      type: object
      description: "Author's prior violations count and action history"
    context_thread:
      type: array
      description: "Parent/sibling posts that provide context (if any)"
    # --- Appeal information ---
    is_appeal: { type: boolean }
    # nullable: a post may carry no appeal reason.
    appeal_reason: { type: string, nullable: true }
    # --- Queue progress ---
    queue_remaining:
      type: array
      description: "Preview of posts still in queue"
    queue_position: { type: integer }
    queue_total: { type: integer }
    # --- Episode progress ---
    current_step: { type: integer }
    max_steps: { type: integer }
    available_actions: { type: array, items: { type: string } }
    cumulative_reward: { type: number }
# Schema of the action an agent submits for the current post.
action_space:
  type: object
  description: >
    Moderation decision for the current post.
  properties:
    # The decision itself; individual tasks restrict this set further via
    # their own allowed_actions list.
    action_type:
      type: string
      enum: [approve, warn, remove, ban_temporary, ban_permanent, escalate]
      description: "The moderation action to apply"
    # Policy category; `none` is one of the permitted enum values.
    violation_type:
      type: string
      enum:
        - spam
        - hate_speech
        - harassment
        - misinformation
        - csam
        - illegal_services
        - doxxing
        - self_harm_risk
        - none
      description: "Optional — identified policy violation category"
    reasoning:
      type: string
      description: "Optional — agent's explanation (logged only, not graded)"
# Per-step reward definition.
reward:
  # NOTE(review): numeric fields elsewhere in this file use `type: number`
  # (e.g. cumulative_reward); confirm which spelling the consumer expects.
  type: float
  # Lower and upper bound of a single step's reward.
  range: [0.0, 1.0]
  description: >
    Per-step reward signal. Non-sparse: each post scores independently.
    Partial credit awarded for being one level off on the severity ladder.
    Episode score = mean of all per-post scores.
# Task catalogue, ordered easy -> medium -> hard. Each task fixes the queue
# size (num_posts), a step budget (max_steps), the subset of actions the
# agent may use, and a one-line summary of how the task is scored.
# NOTE(review): max_steps exceeds num_posts in every task; presumably this
# leaves headroom beyond one step per post — confirm against the server.
tasks:
  # Two-way decision only: approve or remove.
  - id: task_easy
    name: "Binary Content Moderation"
    difficulty: easy
    num_posts: 8
    max_steps: 12
    description: >
      Classify 8 posts as APPROVE or REMOVE. Posts contain clear signals —
      obvious spam, explicit hate speech, direct threats versus everyday benign content.
    allowed_actions: [approve, remove]
    scoring: "Binary exact match. Score = correct / total."

  # Five actions are allowed; escalate is not available here.
  - id: task_medium
    name: "Tiered Policy Enforcement"
    difficulty: medium
    num_posts: 10
    max_steps: 18
    description: >
      Apply 5-level policy to 10 posts including edge cases: harsh criticism,
      repeat offenders, political content, first vs habitual violations.
      Partial credit for being one level off the severity ladder.
    allowed_actions: [approve, warn, remove, ban_temporary, ban_permanent]
    scoring: "Action distance score (70%) + violation type (30%)."

  # All six actions, including escalate; appeals and thread context matter.
  - id: task_hard
    name: "Full Queue Management with Context and Appeals"
    difficulty: hard
    num_posts: 12
    max_steps: 24
    description: >
      Handle 12 items including appeals requiring evidence review, ambiguous
      context (gaming slang, creative writing, recovery stories), repeat offenders,
      crisis escalations, and second-appeal denials. Correct answer often
      requires reasoning over author history and thread context.
    allowed_actions:
      - approve
      - warn
      - remove
      - ban_temporary
      - ban_permanent
      - escalate
    scoring: "Action score (50%) + context-aware bonus (30%) + violation type (20%)."
# HTTP endpoints exposed by the environment server.
# NOTE(review): Observation, Action, StepResult and EnvironmentState are
# model names not defined in this file; presumably declared in the server
# code — confirm.
endpoints:
  - method: GET
    path: /health
    description: Liveness check returns 200 when server is ready

  - method: GET
    path: /tasks
    description: List all available tasks with metadata

  - method: POST
    path: /reset
    query_params:
      # Defaults to the easy task; the enum lists the three task ids
      # declared under `tasks`.
      task_id:
        type: string
        default: task_easy
        enum: [task_easy, task_medium, task_hard]
    response_model: Observation
    description: Start a new episode; returns first Observation

  - method: POST
    path: /step
    request_body: Action
    response_model: StepResult
    description: Submit moderation action for current post; returns StepResult

  - method: GET
    path: /state
    response_model: EnvironmentState
    description: Full snapshot of current environment state
# Server launch metadata.
# Presumably a "module:attribute" import string for the app object — confirm.
# Quoted so the embedded colon is never mistaken for a mapping separator.
entry_point: "app:app"
framework: fastapi
# Quoted so the version is kept as a string and not read as the float 3.1.
python_version: "3.10"