Spaces:

Pratap-K
/

meta-content-moderation-env

Sleeping

App Files Files Community

meta-content-moderation-env / server /tasks /task_single_label.py

Pratap-K

Openenv Env Setup

1b50e57 27 days ago

raw

history blame contribute delete

1.75 kB

	# server/tasks/task_single_label.py
	"""
	Task: single-label-classify
	Difficulty: Easy
	Content types: text_post, image_description
	Steps per episode: 10
	Description:
	Agent receives one content item per step.
	Must return exactly ONE ViolationCategory label.
	Action must be one of: approve (if CLEAN) or remove/restrict.
	"""
	from __future__ import annotations
	from models import (
	ContentItem, ModerationObservation, ContentType
	)
	from server.dataset import get_posts, get_image_descriptions, get_policy_excerpt, get_community_standards

	# Identifier attached to every observation produced for this task.
	TASK_NAME = "single-label-classify"
	# Number of steps in one episode (one content item is served per step).
	MAX_STEPS = 10
	# Prompt handed to the agent with each observation. The pieces below are
	# joined by implicit string-literal concatenation, so each one carries its
	# own trailing space.
	INSTRUCTIONS = (
	    "You are a content moderator. "
	    "For each item, assign EXACTLY ONE violation category from the list, "
	    "or mark it as CLEAN if no violation exists. "
	    "Then choose the appropriate enforcement action: "
	    "approve (clean), remove (clear violation), restrict (borderline), "
	    "or escalate (unclear). "
	    "Respond with a JSON ModerationDecision."
	)


	def build_episode(seed: int = 42) -> list[ContentItem]:
	    """Build the content items for one episode.
	
	    Takes up to 7 records from ``get_posts(seed)`` followed by up to 3
	    records from ``get_image_descriptions(seed)`` and wraps each one in a
	    ``ContentItem``. The grading fields (ground-truth labels, ground-truth
	    action and difficulty) are dropped before construction, so they are
	    never passed to ``ContentItem``.
	
	    NOTE(review): this function does not reorder anything itself -- posts
	    always come before image descriptions. Any shuffling presumably happens
	    inside the seeded dataset helpers; confirm against ``server.dataset``.
	
	    Args:
	        seed: Value forwarded unchanged to both dataset helpers.
	
	    Returns:
	        The episode's ``ContentItem`` objects, posts first, then images.
	    """
	    # Keys that must not be handed to the ContentItem constructor.
	    hidden_fields = ("ground_truth_labels", "ground_truth_action", "difficulty")
	
	    def to_content_item(record):
	        # Keep every field of the raw record except the hidden ones.
	        public = {
	            name: value
	            for name, value in record.items()
	            if name not in hidden_fields
	        }
	        return ContentItem(**public)
	
	    post_records = get_posts(seed)[:7]
	    image_records = get_image_descriptions(seed)[:3]
	    return [to_content_item(record) for record in post_records + image_records]


	def build_observation(step: int, item: ContentItem) -> ModerationObservation:
	    """Assemble the observation shown to the agent for one step.
	
	    Args:
	        step: Step value stored on the observation as-is.
	        item: The content item to be moderated at this step.
	
	    Returns:
	        A ``ModerationObservation`` carrying the step, the item, the policy
	        excerpt looked up for the item's content type, and this task's name
	        and instructions.
	    """
	    # The community standards are fetched on every call and passed to the
	    # excerpt lookup together with the content type's enum value.
	    standards = get_community_standards()
	    content_type_key = item.content_type.value
	    excerpt = get_policy_excerpt(content_type_key, standards)
	    observation = ModerationObservation(
	        step=step,
	        content_item=item,
	        policy_excerpt=excerpt,
	        task_name=TASK_NAME,
	        instructions=INSTRUCTIONS,
	    )
	    return observation