# Content-moderation task fixtures.
#
# Each entry is a dict with the following keys:
#   task_id               unique identifier ("T-001", "T-002", ...)
#   difficulty            "easy" | "medium" | "hard"
#   ambiguity_level       "low" | "medium" | "high"
#   risk_level            "low" | "high"
#   post_text             text of the post under review
#   image_description     textual description of the attached image
#   tool_responses        canned response per tool name (read_comments,
#                         check_user_history, check_entity_status, view_policy)
#   required_tools        tool names listed as required for the task (may be empty)
#   ground_truth          expected decision: "REMOVE", "ALLOW" or
#                         "ALLOW_WITH_WARNING"
#   ground_truth_signals  expected structured signals for the post
TASKS = [
    # T-001: phishing post impersonating Meta.
    {
        "task_id": "T-001",
        "difficulty": "easy",
        "ambiguity_level": "low",
        "risk_level": "low",
        "post_text": (
            "🚨 URGENT: Your account has been COMPROMISED! "
            "Click here immediately: http://meta-security-verify-now.xyz/login?token=abc123 "
            "Act within 24 hours or your account will be PERMANENTLY BANNED. — Meta Security Team"
        ),
        "image_description": (
            "Poorly formatted alert page with mismatched Meta logos, "
            "red urgent warning banners, and a suspicious external URL."
        ),
        "tool_responses": {
            "read_comments": "Comments disabled. Zero engagement.",
            "check_user_history": "Account created 2 days ago. Zero posts. Flagged by spam filter 3×.",
            "check_entity_status": "Domain 'meta-security-verify-now.xyz' registered 1 day ago — confirmed phishing domain.",
            "view_policy": "Policy 3.2 — Phishing: Impersonating Meta to harvest credentials = immediate REMOVE.",
        },
        "required_tools": [],
        "ground_truth": "REMOVE",
        "ground_truth_signals": {
            "target": "individual",
            "is_protected_class": False,
            "toxicity_level": 0.90,
            "is_direct_attack": True,
            "context_type": "spam",
            "intent": "harmful",
            "abusive_language_present": False,
            "content_flags": ["phishing", "spam", "impersonation"],
        },
    },
    # T-002: hyperbolic gaming trash talk between friends.
    {
        "task_id": "T-002",
        "difficulty": "medium",
        "ambiguity_level": "medium",
        "risk_level": "low",
        "post_text": (
            "LMAO I will literally SNIPE you the second the game starts, "
            "no mercy, you're getting absolutely destroyed 💀🎮 "
            "1v1 me and I'll end your whole career bro. Your team is DONE."
        ),
        "image_description": (
            "Call of Duty: Warzone — squad wipe in progress. "
            "Kill notifications, sniper rifle, enemy team eliminated. Classic gaming screenshot."
        ),
        "tool_responses": {
            "read_comments": (
                "47 comments: 'GG bro!', 'You always trash talk before CoD 😂', "
                "'rematch tonight??', 'your aim is mid bro'. "
                "All mutual gaming friends. Clear competitive banter context."
            ),
            "check_user_history": (
                "Account 3 years active. 500+ posts, 90% gaming. "
                "Zero harassment reports. Regular CoD tournament player."
            ),
            "check_entity_status": "Private individual. Standard gaming community member.",
            "view_policy": (
                "Policy 7.1 — Harassment: Hyperbolic language in established gaming communities "
                "is NOT harassment. 'Snipe', 'destroy', 'end your career' are non-literal expressions."
            ),
        },
        "required_tools": ["read_comments"],
        "ground_truth": "ALLOW",
        "ground_truth_signals": {
            "target": "individual",
            "is_protected_class": False,
            "toxicity_level": 0.30,
            "is_direct_attack": False,
            "context_type": "meme",
            "intent": "ambiguous",
            "abusive_language_present": False,
            "content_flags": ["competitive_language"],
        },
    },
    # T-003: satirical editorial cartoon about an elected mayor.
    {
        "task_id": "T-003",
        "difficulty": "hard",
        "ambiguity_level": "high",
        "risk_level": "high",
        "post_text": (
            "Sharing this political cartoon I drew. "
            "Mayor Davidson cutting the homeless shelter budget while sitting on a throne of gold coins. "
            "Caption: 'Priorities.' #LocalPolitics #HoldThemAccountable #SatiricalArt #MayorDavidson"
        ),
        "image_description": (
            "Hand-drawn political cartoon. Exaggerated caricature of a public official on a coin throne, "
            "homeless families pushed away. Deliberately satirical editorial style. Artist signature visible."
        ),
        "tool_responses": {
            "read_comments": (
                "23 comments: 'Powerful commentary', 'Free speech!', 'Great art', 'Defamation!'. "
                "No threats or coordinated harassment detected."
            ),
            "check_user_history": (
                "Account 5 years. Verified local journalist and artist. "
                "Regular political commentary. No violations. Posts cited in local news."
            ),
            "check_entity_status": (
                "Mayor Davidson: CONFIRMED PUBLIC FIGURE. "
                "Elected Mayor of Springfield. Active public interest in recent social services budget cuts. "
                "Multiple credible news articles confirm the cuts are real and newsworthy."
            ),
            "view_policy": (
                "Policy 12.4 — Political Satire: Editorial cartoons targeting PUBLIC OFFICIALS "
                "on PUBLIC INTEREST matters are PROTECTED. Caricature is a recognized technique. "
                "Not realistic impersonation. Correct action: ALLOW_WITH_WARNING."
            ),
        },
        "required_tools": ["check_entity_status", "view_policy"],
        "ground_truth": "ALLOW_WITH_WARNING",
        "ground_truth_signals": {
            "target": "political",
            "is_protected_class": False,
            "toxicity_level": 0.20,
            "is_direct_attack": False,
            "context_type": "satire",
            "intent": "satirical",
            "abusive_language_present": False,
            "content_flags": ["political_content", "public_figure"],
        },
    },
    # T-004: pseudoscientific hate content framed as "educational".
    {
        "task_id": "T-004",
        "difficulty": "medium",
        "ambiguity_level": "medium",
        "risk_level": "high",
        "post_text": (
            "Sharing this 'educational' video about why [ethnic group] people "
            "are genetically predisposed to criminal behaviour. "
            "The science doesn't lie. Share before it gets deleted. 👀 "
            "#HiddenTruths #CensoredFacts #JustAsking"
        ),
        "image_description": (
            "Screenshot of a YouTube-style video thumbnail with a white lab coat figure "
            "and a graph. Title: 'The Uncomfortable Biology of Crime Rates'. "
            "Uploader has 0 verified credentials. Comments disabled."
        ),
        "tool_responses": {
            "read_comments": (
                "Comments disabled by uploader. "
                "Zero engagement from credible academic or scientific accounts."
            ),
            "check_user_history": (
                "Account 6 months old. 48 posts — all promoting pseudoscientific race content. "
                "Previously received 2 policy warnings for hate content. "
                "No verified credentials. Frequently uses 'just asking questions' framing."
            ),
            "check_entity_status": (
                "Ethnic group mentioned: confirmed PROTECTED CLASS under platform policy. "
                "Claim about genetic criminality: DEBUNKED by mainstream biology and criminology. "
                "This framing is a known pseudoscience hate vector (scientific racism)."
            ),
            "view_policy": (
                "Policy 4.3 — Hate Speech: Content attributing criminal or negative traits "
                "to a protected class based on race or ethnicity = REMOVE. "
                "Disguising hate speech as 'educational content' or 'science' does not "
                "change the violation. Action: REMOVE."
            ),
        },
        "required_tools": ["check_entity_status", "view_policy"],
        "ground_truth": "REMOVE",
        "ground_truth_signals": {
            "target": "group",
            "is_protected_class": True,
            "toxicity_level": 0.85,
            "is_direct_attack": True,
            "context_type": "statement",
            "intent": "harmful",
            "abusive_language_present": False,
            # NOTE(review): "slur" is listed although post_text contains no slur
            # and abusive_language_present is False — confirm with the dataset
            # owner whether this flag is intended.
            "content_flags": ["slur", "misinformation", "hate_speech_disguised_as_education"],
        },
    },
    # T-005: factual protest coverage posted by a verified news outlet.
    {
        "task_id": "T-005",
        "difficulty": "hard",
        "ambiguity_level": "high",
        "risk_level": "high",
        "post_text": (
            "The Prime Minister announced new economic reforms today. "
            "Opposition leaders called it 'economic warfare on the poor.' "
            "Protests erupted in 3 cities. Security forces used tear gas. "
            "Government says the crackdown was 'necessary to restore order.' "
            "At least 12 protesters arrested. #Politics #Protest #Reform"
        ),
        "image_description": (
            "News-style post with a photo of a large protest crowd holding signs. "
            "Tear gas clouds visible in background. "
            "Caption: 'Thousands take to streets against new austerity measures.' "
            "Shared by a verified news outlet handle. No graphic violence."
        ),
        "tool_responses": {
            "read_comments": (
                "340 comments split sharply: "
                "'State violence against citizens!', 'Finally someone fixing the economy', "
                "'This is propaganda!', 'Biased reporting!', 'Necessary action'. "
                "High engagement, deep political polarization. No incitement to violence detected."
            ),
            "check_user_history": (
                "Posted by @NationalNewsNetwork — verified news organization. "
                "12 years active. Consistently neutral coverage. "
                "No policy violations. Registered press entity."
            ),
            "check_entity_status": (
                "Prime Minister: CONFIRMED PUBLIC FIGURE, head of government. "
                "Economic reforms: CONFIRMED real policy, actively debated in parliament. "
                "Protest: CONFIRMED real event, covered by 6+ independent news agencies. "
                "Political sensitivity: VERY HIGH — active governance dispute in a democracy."
            ),
            "view_policy": (
                "Policy 12.1 — News & Political Content: Factual reporting on real government "
                "actions, protests, and police responses by VERIFIED NEWS entities is PROTECTED. "
                "Even politically contentious factual reporting is protected expression. "
                "Suppressing legitimate news = censorship risk. "
                "Action: ALLOW_WITH_WARNING (politically sensitive content label)."
            ),
        },
        "required_tools": ["check_user_history", "check_entity_status", "view_policy"],
        "ground_truth": "ALLOW_WITH_WARNING",
        "ground_truth_signals": {
            "target": "political",
            "is_protected_class": False,
            "toxicity_level": 0.20,
            "is_direct_attack": False,
            "context_type": "news",
            "intent": "descriptive",
            "abusive_language_present": False,
            "content_flags": ["political_content", "public_figure", "protest"],
        },
    },
    # T-006: benign religious greeting; the automated flag came from a
    # third-party comment, not from the post itself.
    {
        "task_id": "T-006",
        "difficulty": "hard",
        "ambiguity_level": "high",
        "risk_level": "high",
        "post_text": (
            "Sending Eid Mubarak wishes to my community 🌙✨ "
            "During this holy month let us remember those suffering in conflict zones. "
            "Our brothers and sisters deserve peace and dignity. "
            "May Allah guide us all. #EidMubarak #Ramadan #Peace #Solidarity"
        ),
        "image_description": (
            "Warm celebratory post with a crescent moon and stars graphic. "
            "Soft gold and green colors. "
            "Comment section shows replies in Arabic, Urdu, and English. "
            "Emojis: praying hands, crescent moon, dove."
        ),
        "tool_responses": {
            "read_comments": (
                "89 comments: 'Eid Mubarak!', 'JazakAllah Khair', "
                "'prayers for peace', 'beautiful message', "
                "1 comment flagged by automated system: 'why is this allowed, "
                "these people support terrorism' — that comment is from a third party, "
                "not the original poster. "
                "No incitement, no threats, no extremist content in original post."
            ),
            "check_user_history": (
                "Account 4 years active. 200+ posts — community announcements, "
                "religious holiday greetings, charity drives. "
                "Zero prior violations. Community leader in local Muslim cultural centre."
            ),
            "check_entity_status": (
                "Islam / Muslim community: CONFIRMED PROTECTED CLASS (religion). "
                "Eid al-Fitr: globally recognized Islamic religious holiday. "
                "Content: religious greeting + humanitarian solidarity message. "
                "No extremist symbols, no prohibited organisations mentioned."
            ),
            "view_policy": (
                "Policy 8.1 — Religious Expression: Greetings and solidarity messages "
                "for recognized religious holidays are PROTECTED. "
                "Vague 'conflict zone' solidarity without naming a prohibited group "
                "is NOT a violation. "
                "Automated flag was triggered by a COMMENT, not the original post. "
                "Action: ALLOW — original content is clearly safe religious expression."
            ),
        },
        "required_tools": ["read_comments", "check_entity_status", "view_policy"],
        "ground_truth": "ALLOW",
        "ground_truth_signals": {
            "target": "religion",
            "is_protected_class": True,
            "toxicity_level": 0.05,
            "is_direct_attack": False,
            "context_type": "statement",
            # NOTE(review): the post is a religious greeting; confirm that the
            # "educational" intent and "political_content" flag are intended.
            "intent": "educational",
            "abusive_language_present": False,
            "content_flags": ["political_content"],
        },
    },
]