Spaces:

sniki28
/

content-moderation-queue

Sleeping

App Files Files Community

content-moderation-queue / openenv.yaml

sniki28

Upload openenv.yaml with huggingface_hub

f91dadc verified 12 days ago

raw

history blame contribute delete

4.68 kB

	# Manifest identity and discovery metadata for this environment.
	name: content-moderation-queue
	# Quoted so the version is always read as a string, never as a number.
	version: "1.0.0"
	# Folded scalar (`>`): the body must be indented deeper than the key,
	# otherwise the scalar is empty and the prose lines become stray tokens.
	description: >
	  A real-world content moderation environment where AI agents learn to triage
	  social media posts using a tiered policy framework. Agents moderate a queue
	  of posts — classifying violations, applying graduated consequences, handling
	  appeals, and escalating crisis content. Simulates the daily work of a Trust
	  & Safety moderation team.

	author: content-moderation-env
	# Free-form labels; presumably used for search/discovery — confirm with the
	# consuming registry.
	tags:
	- openenv
	- content-moderation
	- trust-and-safety
	- nlp
	- real-world
	- policy-enforcement

	# Schema of the observation object: the post currently under review plus
	# queue and episode context. Every key below is nested under
	# `observation_space`; left flat, `type:` and `description:` would repeat
	# as duplicate sibling keys.
	observation_space:
	  type: object
	  description: >
	    Current post to moderate plus queue/context state.
	  properties:
	    # --- the post itself ---
	    post_id: { type: string }
	    content: { type: string, description: "Full text of the post" }
	    author_id: { type: string }
	    author_history:
	      type: object
	      description: "Author's prior violations count and action history"
	    context_thread:
	      type: array
	      description: "Parent/sibling posts that provide context (if any)"
	    # --- appeal information ---
	    is_appeal: { type: boolean }
	    # NOTE(review): nullable; presumably null when is_appeal is false — confirm.
	    appeal_reason: { type: string, nullable: true }
	    # --- queue and episode progress ---
	    queue_remaining: { type: array, description: "Preview of posts still in queue" }
	    queue_position: { type: integer }
	    queue_total: { type: integer }
	    current_step: { type: integer }
	    max_steps: { type: integer }
	    available_actions: { type: array, items: { type: string } }
	    cumulative_reward: { type: number }

	# Schema of the action object an agent submits for the current post.
	# All keys are nested under `action_space`; each property carries its own
	# type/enum/description mapping.
	action_space:
	  type: object
	  description: >
	    Moderation decision for the current post.
	  properties:
	    action_type:
	      type: string
	      enum: [approve, warn, remove, ban_temporary, ban_permanent, escalate]
	      description: "The moderation action to apply"
	    violation_type:
	      type: string
	      # Block list (same values, same order) keeps the line within length limits.
	      # `none` is a plain string here, not YAML null.
	      enum:
	      - spam
	      - hate_speech
	      - harassment
	      - misinformation
	      - csam
	      - illegal_services
	      - doxxing
	      - self_harm_risk
	      - none
	      description: "Optional — identified policy violation category"
	    reasoning:
	      type: string
	      description: "Optional — agent's explanation (logged only, not graded)"

	# Reward signal definition. Keys are nested under `reward` so they do not
	# collide with the `type`/`description` keys of other sections.
	reward:
	  # NOTE(review): schema sections in this file use `number` for numeric
	  # fields; confirm `float` is the spelling the consumer expects here.
	  type: float
	  # Two-element list; per the description below, per-post scores are averaged
	  # into the episode score.
	  range: [0.0, 1.0]
	  description: >
	    Per-step reward signal. Non-sparse: each post scores independently.
	    Partial credit awarded for being one level off on the severity ladder.
	    Episode score = mean of all per-post scores.

	# Task catalogue: a list of task definitions. Each item's keys are indented
	# under its leading `-` so they belong to that item.
	tasks:
	# Easy tier: only approve/remove are allowed; scored by exact match.
	- id: task_easy
	  name: "Binary Content Moderation"
	  difficulty: easy
	  num_posts: 8
	  # Step budget is larger than num_posts; the reason is not stated here.
	  max_steps: 12
	  description: >
	    Classify 8 posts as APPROVE or REMOVE. Posts contain clear signals —
	    obvious spam, explicit hate speech, direct threats versus everyday benign content.
	  allowed_actions: [approve, remove]
	  scoring: "Binary exact match. Score = correct / total."

	# Medium tier: five-action severity ladder (no `escalate`); scoring weights
	# action distance 70% and violation type 30%.
	- id: task_medium
	  name: "Tiered Policy Enforcement"
	  difficulty: medium
	  num_posts: 10
	  max_steps: 18
	  description: >
	    Apply 5-level policy to 10 posts including edge cases: harsh criticism,
	    repeat offenders, political content, first vs habitual violations.
	    Partial credit for being one level off the severity ladder.
	  allowed_actions: [approve, warn, remove, ban_temporary, ban_permanent]
	  scoring: "Action distance score (70%) + violation type (30%)."

	# Hard tier: all six actions including `escalate`; scoring weights action
	# 50%, context-aware bonus 30%, and violation type 20%.
	- id: task_hard
	  name: "Full Queue Management with Context and Appeals"
	  difficulty: hard
	  num_posts: 12
	  max_steps: 24
	  description: >
	    Handle 12 items including appeals requiring evidence review, ambiguous
	    context (gaming slang, creative writing, recovery stories), repeat offenders,
	    crisis escalations, and second-appeal denials. Correct answer often
	    requires reasoning over author history and thread context.
	  allowed_actions: [approve, warn, remove, ban_temporary, ban_permanent, escalate]
	  scoring: "Action score (50%) + context-aware bonus (30%) + violation type (20%)."

	# HTTP routes exposed by the environment server. Each list item is one
	# route; its keys are indented under the leading `-`. The response/request
	# model names (Observation, Action, StepResult, EnvironmentState) are
	# defined outside this file.
	endpoints:
	- method: GET
	  path: /health
	  description: Liveness check — returns 200 when server is ready
	- method: GET
	  path: /tasks
	  description: List all available tasks with metadata
	- method: POST
	  path: /reset
	  # Block form of the former inline mapping; parsed value is identical.
	  query_params:
	    task_id:
	      type: string
	      default: task_easy
	      enum: [task_easy, task_medium, task_hard]
	  response_model: Observation
	  description: Start a new episode; returns first Observation
	- method: POST
	  path: /step
	  request_body: Action
	  response_model: StepResult
	  description: Submit moderation action for current post; returns StepResult
	- method: GET
	  path: /state
	  response_model: EnvironmentState
	  description: Full snapshot of current environment state

	# Runtime settings for serving the environment.
	# NOTE(review): looks like a "module:attribute" import string for the
	# web app — confirm. Quoted so the embedded colon can never be read as a
	# key/value separator.
	entry_point: "app:app"
	framework: fastapi
	# Must stay quoted: unquoted 3.10 would be parsed as the float 3.1.
	python_version: "3.10"