---
# Manifest header: identity and discovery metadata for this environment.
name: content-moderation-queue
version: '1.0.0'
description: >
  A real-world content moderation environment where AI agents learn to
  triage social media posts using a tiered policy framework. Agents
  moderate a queue of posts — classifying violations, applying graduated
  consequences, handling appeals, and escalating crisis content. Simulates
  the daily work of a Trust & Safety moderation team.

author: content-moderation-env

# Labels attached to this environment.
tags:
  - openenv
  - content-moderation
  - trust-and-safety
  - nlp
  - real-world
  - policy-enforcement

# Schema of what the agent observes: the current post plus queue/context
# state.
observation_space:
  type: object
  description: >
    Current post to moderate plus queue/context state.
  properties:
    # --- Post under review ---
    post_id:
      type: string
    content:
      type: string
      description: "Full text of the post"

    # --- Author ---
    author_id:
      type: string
    author_history:
      type: object
      description: "Author's prior violations count and action history"

    # --- Surrounding context ---
    context_thread:
      type: array
      description: "Parent/sibling posts that provide context (if any)"

    # --- Appeal information ---
    is_appeal:
      type: boolean
    appeal_reason:
      type: string
      nullable: true

    # --- Queue progress ---
    queue_remaining:
      type: array
      description: "Preview of posts still in queue"
    queue_position:
      type: integer
    queue_total:
      type: integer

    # --- Episode progress ---
    current_step:
      type: integer
    max_steps:
      type: integer
    available_actions:
      type: array
      items:
        type: string
    cumulative_reward:
      type: number

# Schema of the moderation decision the agent makes for the current post.
action_space:
  type: object
  description: >
    Moderation decision for the current post.
  # Only the action itself is mandatory. The other two properties are
  # described as optional, so they are deliberately left out of this list.
  # NOTE(review): confirm the server-side Action model agrees.
  required:
    - action_type
  properties:
    action_type:
      type: string
      enum:
        - approve
        - warn
        - remove
        - ban_temporary
        - ban_permanent
        - escalate
      description: "The moderation action to apply"
    violation_type:
      type: string
      # `none` is the explicit "no violation" category.
      enum:
        - spam
        - hate_speech
        - harassment
        - misinformation
        - csam
        - illegal_services
        - doxxing
        - self_harm_risk
        - none
      description: "Optional — identified policy violation category"
    reasoning:
      type: string
      description: "Optional — agent's explanation (logged only, not graded)"

# Reward signal: a per-step score within the closed range given below.
# NOTE(review): `type` is `float` here, while observation_space uses
# `number` for cumulative_reward — confirm which spelling the consumer
# expects.
reward:
  type: float
  range: [0.0, 1.0]
  description: >
    Per-step reward signal. Non-sparse: each post scores
    independently. Partial credit awarded for being one level off
    on the severity ladder. Episode score = mean of all per-post
    scores.

# Task catalogue, listed from easy to hard. Each entry gives the number of
# posts, the step budget, the actions the agent may use, and how the task
# is scored.
tasks:
  # Easy: two-way approve/remove decisions.
  - id: task_easy
    name: 'Binary Content Moderation'
    difficulty: easy
    num_posts: 8
    max_steps: 12
    description: >
      Classify 8 posts as APPROVE or REMOVE. Posts contain clear
      signals — obvious spam, explicit hate speech, direct threats
      versus everyday benign content.
    allowed_actions:
      - approve
      - remove
    scoring: 'Binary exact match. Score = correct / total.'

  # Medium: every action except `escalate`.
  - id: task_medium
    name: 'Tiered Policy Enforcement'
    difficulty: medium
    num_posts: 10
    max_steps: 18
    description: >
      Apply 5-level policy to 10 posts including edge cases: harsh
      criticism, repeat offenders, political content, first vs habitual
      violations. Partial credit for being one level off the severity
      ladder.
    allowed_actions:
      - approve
      - warn
      - remove
      - ban_temporary
      - ban_permanent
    scoring: 'Action distance score (70%) + violation type (30%).'

  # Hard: full action set, including `escalate`.
  - id: task_hard
    name: 'Full Queue Management with Context and Appeals'
    difficulty: hard
    num_posts: 12
    max_steps: 24
    description: >
      Handle 12 items including appeals requiring evidence review,
      ambiguous context (gaming slang, creative writing, recovery stories),
      repeat offenders, crisis escalations, and second-appeal denials.
      Correct answer often requires reasoning over author history and
      thread context.
    allowed_actions:
      - approve
      - warn
      - remove
      - ban_temporary
      - ban_permanent
      - escalate
    scoring: 'Action score (50%) + context-aware bonus (30%) + violation type (20%).'

# HTTP endpoints of the environment server.
endpoints:
  # Liveness probe.
  - method: GET
    path: /health
    description: 'Liveness check — returns 200 when server is ready'

  # Task discovery.
  - method: GET
    path: /tasks
    description: 'List all available tasks with metadata'

  # Episode start; the task is selected through a query parameter.
  - method: POST
    path: /reset
    query_params:
      task_id:
        type: string
        default: task_easy
        enum:
          - task_easy
          - task_medium
          - task_hard
    response_model: Observation
    description: 'Start a new episode; returns first Observation'

  # One moderation decision per call.
  - method: POST
    path: /step
    request_body: Action
    response_model: StepResult
    description: 'Submit moderation action for current post; returns StepResult'

  # State inspection.
  - method: GET
    path: /state
    response_model: EnvironmentState
    description: 'Full snapshot of current environment state'

# Runtime wiring.
# Presumably a `module:attribute` reference to the application object —
# confirm against the launcher.
entry_point: 'app:app'
framework: fastapi
# Kept as a quoted string so it is not read as the float 3.1.
python_version: '3.10'