jeromerichard committed on
Commit
7cf2ffd
·
1 Parent(s): f45aa51

Fix: add server/app.py, uv.lock, project.scripts entry point

Browse files
Files changed (7) hide show
  1. pyproject.toml +11 -25
  2. server/__init__.py +0 -0
  3. server/app.py +157 -0
  4. server/models.py +63 -0
  5. server/tasks.py +296 -0
  6. server/your_environment.py +440 -0
  7. uv.lock +0 -0
pyproject.toml CHANGED
@@ -1,33 +1,19 @@
1
- [build-system]
2
- requires = ["setuptools>=68.0", "wheel"]
3
- build-backend = "setuptools.backends.legacy:build"
4
-
5
- [project]
6
  name = "trust-safety-env"
7
  version = "1.0.0"
8
- description = "Risk-aware Trust & Safety content moderation RL environment OpenEnv compatible"
9
- readme = "README.md"
10
  requires-python = ">=3.11"
11
  dependencies = [
12
- "openenv-core>=0.2.0",
13
- "fastapi>=0.110.0",
14
- "uvicorn[standard]>=0.29.0",
15
- "pydantic>=2.6.0",
16
- "openai>=1.30.0",
17
  "requests>=2.31.0",
18
- "python-dotenv>=1.0.0",
19
  ]
20
 
21
- [project.optional-dependencies]
22
- dev = ["pytest>=8.0"]
23
 
24
- [tool.setuptools.packages.find]
25
- where = ["."]
26
- include = ["*"]
27
-
28
- [tool.openenv]
29
- name = "trust-safety-env"
30
- environment_class = "your_environment.TrustSafetyEnvironment"
31
- action_model = "models.TrustAction"
32
- observation_model = "models.TrustObservation"
33
- state_model = "models.TrustState"
 
1
[project]
name = "trust-safety-env"
version = "1.0.0"
description = "Trust & Safety RL Environment built on OpenEnv"
requires-python = ">=3.11"
dependencies = [
    "fastapi>=0.115.0",
    "uvicorn[standard]>=0.30.0",
    "pydantic>=2.0.0",
    "requests>=2.31.0",
    "openenv-core>=0.2.2",
]

[project.scripts]
# NOTE(review): console-script entry points must reference a callable.
# "server.app:app" is a FastAPI instance, so running `server` invokes app()
# with no arguments — presumably this was meant to be a main() that launches
# uvicorn. Confirm intent before relying on the `server` command.
server = "server.app:app"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
 
 
 
 
 
 
 
server/__init__.py ADDED
File without changes
server/app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import Any, Dict, Optional
5
+
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.responses import JSONResponse
9
+ from pydantic import BaseModel
10
+
11
+ from models import TrustAction, TrustObservation, TrustState, ContentSignals
12
+ from your_environment import TrustSafetyEnvironment
13
+
14
# ── Force manual FastAPI (openenv_core create_app causes 422 on /step) ────────
print("[app] Using manual FastAPI ✅")

# Single module-level environment instance shared by every request.
# NOTE(review): state is per-process — under multi-worker uvicorn each worker
# holds its own episode; confirm single-worker deployment is intended.
_env = TrustSafetyEnvironment(seed=42)

app = FastAPI(
    title="Trust & Safety RL Environment",
    description="Risk-aware content moderation environment for agent training.",
    version="1.0.0",
)

# Fully permissive CORS so browser-based clients can call the API directly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
31
+
32
+
33
+ # ── Serializers ───────────────────────────────────────────────────────────────
34
+
35
+ def _obs_to_dict(obs: TrustObservation) -> Dict[str, Any]:
36
+ return {
37
+ "ticket_id": obs.ticket_id,
38
+ "post_text": obs.post_text,
39
+ "image_description": obs.image_description,
40
+ "comments_found": obs.comments_found,
41
+ "user_history_found": obs.user_history_found,
42
+ "entity_status_found": obs.entity_status_found,
43
+ "policy_found": obs.policy_found,
44
+ "extracted_signals": obs.extracted_signals,
45
+ "validation_result": obs.validation_result,
46
+ "step_number": obs.step_number,
47
+ "info": obs.info,
48
+ "done": obs.done,
49
+ "reward": obs.reward,
50
+ }
51
+
52
+
53
+ def _state_to_dict(s: TrustState) -> Dict[str, Any]:
54
+ return {
55
+ "episode_id": s.episode_id,
56
+ "step_count": s.step_count,
57
+ "current_task_id": s.current_task_id,
58
+ "difficulty": s.difficulty,
59
+ "ambiguity_level": s.ambiguity_level,
60
+ "risk_level": s.risk_level,
61
+ "tools_used": s.tools_used,
62
+ "signals_extracted": s.signals_extracted,
63
+ "is_done": s.is_done,
64
+ }
65
+
66
+
67
+ # ── Request bodies ─────────────────────────────────────────────────────────────
68
+
69
class ResetRequest(BaseModel):
    """Body for POST /reset; both fields optional."""

    # Typed Any so clients may send ints, strings, or omit the field entirely;
    # the environment decides how to interpret seed / episode_id.
    seed: Any = None
    episode_id: Any = None

    # Ignore unknown keys so loosely-formed client payloads don't 422.
    model_config = {"extra": "ignore"}
74
+
75
+
76
class ActionRequest(BaseModel):
    """Body for POST /step — a loose mirror of models.TrustAction."""

    # One of "use_tool", "extract_signals", "final_decision" (dispatched by the env).
    action_type: str = ""
    tool_name: Optional[str] = None
    signals: Optional[Dict[str, Any]] = None  # raw dict — validated below
    final_decision: Optional[str] = None

    model_config = {"extra": "ignore"}  # ← ignore unknown keys from LLM
83
+
84
+
85
+ # ── Helpers ────────────────────────────────────────────────────────────────────
86
+
87
def _parse_signals(raw: Dict[str, Any]) -> ContentSignals:
    """Defensively normalise LLM signal output before Pydantic validation.

    Args:
        raw: Untrusted dict straight from the client/LLM (mutated in place;
            callers pass a copy).

    Returns:
        A validated ContentSignals instance.

    Raises:
        ValueError/TypeError: propagated when a value cannot be coerced;
        the /step handler maps these to HTTP 400.
    """
    # Coerce AND clamp floats to [0, 1]. (The old code's comment said "Clamp"
    # but only coerced; ContentSignals' validator clamps too, so this is safe.)
    raw["toxicity_level"] = max(0.0, min(1.0, float(raw.get("toxicity_level", 0.5))))
    raw["confidence"] = max(0.0, min(1.0, float(raw.get("confidence", 0.5))))

    # content_flags must be a list of strings
    flags = raw.get("content_flags", [])
    if not isinstance(flags, list):
        flags = [flags] if isinstance(flags, str) else []
    raw["content_flags"] = [str(f) for f in flags]

    # Boolean coercion. FIX: bool("false") is True, so string booleans from an
    # LLM must be parsed explicitly before falling back to plain truthiness.
    def _to_bool(value: Any) -> bool:
        if isinstance(value, str):
            return value.strip().lower() not in ("", "false", "no", "0", "none")
        return bool(value)

    raw["is_protected_class"] = _to_bool(raw.get("is_protected_class", False))
    raw["is_direct_attack"] = _to_bool(raw.get("is_direct_attack", False))
    raw["abusive_language_present"] = _to_bool(raw.get("abusive_language_present", False))

    # string fields — fallback to sensible defaults
    raw.setdefault("target", "none")
    raw.setdefault("intent", "ambiguous")
    raw.setdefault("context_type", "statement")

    return ContentSignals(**raw)
110
+
111
+
112
+ # ── Routes ─────────────────────────────────────────────────────────────────────
113
+
114
@app.get("/health")
async def health():
    """Liveness probe for containers/orchestrators."""
    return {"status": "ok", "environment": "trust-safety-env", "version": "1.0.0"}


@app.get("/")
async def root():
    """Landing route — points clients at the interactive docs."""
    return {"status": "ok", "docs": "/docs"}
122
+
123
+
124
@app.post("/reset")
async def reset(body: ResetRequest = ResetRequest()):
    """Start a new episode; the default body lets clients POST an empty payload."""
    obs = _env.reset(seed=body.seed, episode_id=body.episode_id)
    return JSONResponse(_obs_to_dict(obs))
128
+
129
+
130
@app.post("/step")
async def step(body: ActionRequest):
    """Advance the environment one step.

    Signals arrive as a raw dict and are normalised/validated first, so
    malformed LLM output yields a 400 rather than a 500.
    """
    # Parse + validate signals defensively
    signals: Optional[ContentSignals] = None
    if body.signals:
        try:
            signals = _parse_signals(dict(body.signals))  # copy so we don't mutate
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Invalid signals payload: {e}")

    action = TrustAction(
        action_type = body.action_type,
        tool_name = body.tool_name,
        signals = signals,
        final_decision = body.final_decision,
    )

    try:
        obs = _env.step(action)
    except (RuntimeError, ValueError) as e:
        # RuntimeError: step before reset; ValueError: unknown action/tool.
        raise HTTPException(status_code=400, detail=str(e))

    return JSONResponse(_obs_to_dict(obs))
153
+
154
+
155
@app.get("/state")
async def state():
    """Expose the environment's internal episode state (read-only)."""
    return JSONResponse(_state_to_dict(_env.state))
server/models.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from typing import Optional, List, Dict, Any
3
+ from pydantic import BaseModel, Field, field_validator
4
+
5
+
6
class ContentSignals(BaseModel):
    """Structured moderation signals the agent extracts in Layer 2."""

    target: str = "none"  # who the content targets, e.g. "individual", "group", "political"
    is_protected_class: bool = False
    toxicity_level: float = 0.5  # 0..1, clamped by the validator below
    is_direct_attack: bool = False
    context_type: str = "statement"  # e.g. "satire", "spam", "news", "meme"
    intent: str = "ambiguous"
    confidence: float = 0.5  # 0..1, clamped by the validator below
    abusive_language_present: bool = False
    content_flags: List[str] = Field(default_factory=list)

    @field_validator("toxicity_level", "confidence")
    @classmethod
    def clamp_0_1(cls, v: float) -> float:
        """Clamp score fields into [0.0, 1.0]."""
        return max(0.0, min(1.0, float(v)))

    model_config = {"extra": "ignore"}  # tolerate unknown keys from LLM output
23
+
24
+
25
class TrustAction(BaseModel):
    """One agent action: tool use, signal extraction, or a final decision."""

    action_type: str = ""  # "use_tool" | "extract_signals" | "final_decision"
    tool_name: Optional[str] = None  # required when action_type == "use_tool"
    signals: Optional[ContentSignals] = None  # required when extracting signals
    final_decision: Optional[str] = None  # e.g. "ALLOW", "ALLOW_WITH_WARNING", "REMOVE", "ESCALATE"

    model_config = {"extra": "ignore"}
32
+
33
+
34
class TrustObservation(BaseModel):
    """Everything the agent sees after reset()/step()."""

    ticket_id: str = ""  # task_id of the moderation ticket
    post_text: str = ""
    image_description: str = ""
    # Tool outputs — populated only after the matching tool has been used.
    comments_found: Optional[str] = None
    user_history_found: Optional[str] = None
    entity_status_found: Optional[str] = None
    policy_found: Optional[str] = None
    extracted_signals: Optional[Dict[str, Any]] = None  # echo of the agent's Layer-2 signals
    validation_result: Optional[Dict[str, Any]] = None  # consistency check of those signals
    step_number: int = 0
    info: Optional[Dict[str, Any]] = None  # extra metadata (e.g. timeout reason)
    done: bool = False
    reward: Optional[float] = None  # set on terminal observations

    model_config = {"extra": "ignore"}
50
+
51
+
52
class TrustState(BaseModel):
    """Server-side episode bookkeeping, served at GET /state."""

    episode_id: Optional[str] = None  # set to the task_id on reset
    step_count: int = 0
    current_task_id: Optional[str] = None
    difficulty: Optional[str] = None
    ambiguity_level: Optional[str] = None  # NOTE(review): never populated by reset() — confirm
    risk_level: Optional[str] = None
    tools_used: List[str] = Field(default_factory=list)
    signals_extracted: bool = False
    is_done: bool = False

    model_config = {"extra": "ignore"}
server/tasks.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TASKS = [
2
+ {
3
+ "task_id": "T-001",
4
+ "difficulty": "easy",
5
+ "ambiguity_level": "low",
6
+ "risk_level": "low",
7
+ "post_text": (
8
+ "🚨 URGENT: Your account has been COMPROMISED! "
9
+ "Click here immediately: http://meta-security-verify-now.xyz/login?token=abc123 "
10
+ "Act within 24 hours or your account will be PERMANENTLY BANNED. — Meta Security Team"
11
+ ),
12
+ "image_description": (
13
+ "Poorly formatted alert page with mismatched Meta logos, "
14
+ "red urgent warning banners, and a suspicious external URL."
15
+ ),
16
+ "tool_responses": {
17
+ "read_comments": "Comments disabled. Zero engagement.",
18
+ "check_user_history": "Account created 2 days ago. Zero posts. Flagged by spam filter 3×.",
19
+ "check_entity_status": "Domain 'meta-security-verify-now.xyz' registered 1 day ago — confirmed phishing domain.",
20
+ "view_policy": "Policy 3.2 — Phishing: Impersonating Meta to harvest credentials = immediate REMOVE.",
21
+ },
22
+ "required_tools": [],
23
+ "ground_truth": "REMOVE",
24
+ "ground_truth_signals": {
25
+ "target": "individual",
26
+ "is_protected_class": False,
27
+ "toxicity_level": 0.90,
28
+ "is_direct_attack": True,
29
+ "context_type": "spam",
30
+ "intent": "harmful",
31
+ "abusive_language_present": False,
32
+ "content_flags": ["phishing", "spam", "impersonation"],
33
+ },
34
+ },
35
+ {
36
+ "task_id": "T-002",
37
+ "difficulty": "medium",
38
+ "ambiguity_level": "medium",
39
+ "risk_level": "low",
40
+ "post_text": (
41
+ "LMAO I will literally SNIPE you the second the game starts, "
42
+ "no mercy, you're getting absolutely destroyed 💀🎮 "
43
+ "1v1 me and I'll end your whole career bro. Your team is DONE."
44
+ ),
45
+ "image_description": (
46
+ "Call of Duty: Warzone — squad wipe in progress. "
47
+ "Kill notifications, sniper rifle, enemy team eliminated. Classic gaming screenshot."
48
+ ),
49
+ "tool_responses": {
50
+ "read_comments": (
51
+ "47 comments: 'GG bro!', 'You always trash talk before CoD 😂', "
52
+ "'rematch tonight??', 'your aim is mid bro'. "
53
+ "All mutual gaming friends. Clear competitive banter context."
54
+ ),
55
+ "check_user_history": (
56
+ "Account 3 years active. 500+ posts, 90% gaming. "
57
+ "Zero harassment reports. Regular CoD tournament player."
58
+ ),
59
+ "check_entity_status": "Private individual. Standard gaming community member.",
60
+ "view_policy": (
61
+ "Policy 7.1 — Harassment: Hyperbolic language in established gaming communities "
62
+ "is NOT harassment. 'Snipe', 'destroy', 'end your career' are non-literal expressions."
63
+ ),
64
+ },
65
+ "required_tools": ["read_comments"],
66
+ "ground_truth": "ALLOW",
67
+ "ground_truth_signals": {
68
+ "target": "individual",
69
+ "is_protected_class": False,
70
+ "toxicity_level": 0.30,
71
+ "is_direct_attack": False,
72
+ "context_type": "meme",
73
+ "intent": "ambiguous",
74
+ "abusive_language_present": False,
75
+ "content_flags": ["competitive_language"],
76
+ },
77
+ },
78
+ {
79
+ "task_id": "T-003",
80
+ "difficulty": "hard",
81
+ "ambiguity_level": "high",
82
+ "risk_level": "high",
83
+ "post_text": (
84
+ "Sharing this political cartoon I drew. "
85
+ "Mayor Davidson cutting the homeless shelter budget while sitting on a throne of gold coins. "
86
+ "Caption: 'Priorities.' #LocalPolitics #HoldThemAccountable #SatiricalArt #MayorDavidson"
87
+ ),
88
+ "image_description": (
89
+ "Hand-drawn political cartoon. Exaggerated caricature of a public official on a coin throne, "
90
+ "homeless families pushed away. Deliberately satirical editorial style. Artist signature visible."
91
+ ),
92
+ "tool_responses": {
93
+ "read_comments": (
94
+ "23 comments: 'Powerful commentary', 'Free speech!', 'Great art', 'Defamation!'. "
95
+ "No threats or coordinated harassment detected."
96
+ ),
97
+ "check_user_history": (
98
+ "Account 5 years. Verified local journalist and artist. "
99
+ "Regular political commentary. No violations. Posts cited in local news."
100
+ ),
101
+ "check_entity_status": (
102
+ "Mayor Davidson: CONFIRMED PUBLIC FIGURE. "
103
+ "Elected Mayor of Springfield. Active public interest in recent social services budget cuts. "
104
+ "Multiple credible news articles confirm the cuts are real and newsworthy."
105
+ ),
106
+ "view_policy": (
107
+ "Policy 12.4 — Political Satire: Editorial cartoons targeting PUBLIC OFFICIALS "
108
+ "on PUBLIC INTEREST matters are PROTECTED. Caricature is a recognized technique. "
109
+ "Not realistic impersonation. Correct action: ALLOW_WITH_WARNING."
110
+ ),
111
+ },
112
+ "required_tools": ["check_entity_status", "view_policy"],
113
+ "ground_truth": "ALLOW_WITH_WARNING",
114
+ "ground_truth_signals": {
115
+ "target": "political",
116
+ "is_protected_class": False,
117
+ "toxicity_level": 0.20,
118
+ "is_direct_attack": False,
119
+ "context_type": "satire",
120
+ "intent": "satirical",
121
+ "abusive_language_present": False,
122
+ "content_flags": ["political_content", "public_figure"],
123
+ },
124
+ },
125
+ # ── Additional tasks T-004 – T-006 ──────────────────────────────────────────
126
+
127
+ {
128
+ "task_id": "T-004",
129
+ "difficulty": "medium",
130
+ "ambiguity_level": "medium",
131
+ "risk_level": "high",
132
+ "post_text": (
133
+ "Sharing this 'educational' video about why [ethnic group] people "
134
+ "are genetically predisposed to criminal behaviour. "
135
+ "The science doesn't lie. Share before it gets deleted. 👀 "
136
+ "#HiddenTruths #CensoredFacts #JustAsking"
137
+ ),
138
+ "image_description": (
139
+ "Screenshot of a YouTube-style video thumbnail with a white lab coat figure "
140
+ "and a graph. Title: 'The Uncomfortable Biology of Crime Rates'. "
141
+ "Uploader has 0 verified credentials. Comments disabled."
142
+ ),
143
+ "tool_responses": {
144
+ "read_comments": (
145
+ "Comments disabled by uploader. "
146
+ "Zero engagement from credible academic or scientific accounts."
147
+ ),
148
+ "check_user_history": (
149
+ "Account 6 months old. 48 posts — all promoting pseudoscientific race content. "
150
+ "Previously received 2 policy warnings for hate content. "
151
+ "No verified credentials. Frequently uses 'just asking questions' framing."
152
+ ),
153
+ "check_entity_status": (
154
+ "Ethnic group mentioned: confirmed PROTECTED CLASS under platform policy. "
155
+ "Claim about genetic criminality: DEBUNKED by mainstream biology and criminology. "
156
+ "This framing is a known pseudoscience hate vector (scientific racism)."
157
+ ),
158
+ "view_policy": (
159
+ "Policy 4.3 — Hate Speech: Content attributing criminal or negative traits "
160
+ "to a protected class based on race or ethnicity = REMOVE. "
161
+ "Disguising hate speech as 'educational content' or 'science' does not "
162
+ "change the violation. Action: REMOVE."
163
+ ),
164
+ },
165
+ "required_tools": ["check_entity_status", "view_policy"],
166
+ "ground_truth": "REMOVE",
167
+ "ground_truth_signals": {
168
+ "target": "group",
169
+ "is_protected_class": True,
170
+ "toxicity_level": 0.85,
171
+ "is_direct_attack": True,
172
+ "context_type": "statement",
173
+ "intent": "harmful",
174
+ "abusive_language_present": False,
175
+ "content_flags": ["slur", "misinformation", "hate_speech_disguised_as_education"],
176
+ },
177
+ },
178
+
179
+ {
180
+ "task_id": "T-005",
181
+ "difficulty": "hard",
182
+ "ambiguity_level": "high",
183
+ "risk_level": "high",
184
+ "post_text": (
185
+ "The Prime Minister announced new economic reforms today. "
186
+ "Opposition leaders called it 'economic warfare on the poor.' "
187
+ "Protests erupted in 3 cities. Security forces used tear gas. "
188
+ "Government says the crackdown was 'necessary to restore order.' "
189
+ "At least 12 protesters arrested. #Politics #Protest #Reform"
190
+ ),
191
+ "image_description": (
192
+ "News-style post with a photo of a large protest crowd holding signs. "
193
+ "Tear gas clouds visible in background. "
194
+ "Caption: 'Thousands take to streets against new austerity measures.' "
195
+ "Shared by a verified news outlet handle. No graphic violence."
196
+ ),
197
+ "tool_responses": {
198
+ "read_comments": (
199
+ "340 comments split sharply: "
200
+ "'State violence against citizens!', 'Finally someone fixing the economy', "
201
+ "'This is propaganda!', 'Biased reporting!', 'Necessary action'. "
202
+ "High engagement, deep political polarization. No incitement to violence detected."
203
+ ),
204
+ "check_user_history": (
205
+ "Posted by @NationalNewsNetwork — verified news organization. "
206
+ "12 years active. Consistently neutral coverage. "
207
+ "No policy violations. Registered press entity."
208
+ ),
209
+ "check_entity_status": (
210
+ "Prime Minister: CONFIRMED PUBLIC FIGURE, head of government. "
211
+ "Economic reforms: CONFIRMED real policy, actively debated in parliament. "
212
+ "Protest: CONFIRMED real event, covered by 6+ independent news agencies. "
213
+ "Political sensitivity: VERY HIGH — active governance dispute in a democracy."
214
+ ),
215
+ "view_policy": (
216
+ "Policy 12.1 — News & Political Content: Factual reporting on real government "
217
+ "actions, protests, and police responses by VERIFIED NEWS entities is PROTECTED. "
218
+ "Even politically contentious factual reporting is protected expression. "
219
+ "Suppressing legitimate news = censorship risk. "
220
+ "Action: ALLOW_WITH_WARNING (politically sensitive content label)."
221
+ ),
222
+ },
223
+ "required_tools": ["check_user_history", "check_entity_status", "view_policy"],
224
+ "ground_truth": "ALLOW_WITH_WARNING",
225
+ "ground_truth_signals": {
226
+ "target": "political",
227
+ "is_protected_class": False,
228
+ "toxicity_level": 0.20,
229
+ "is_direct_attack": False,
230
+ "context_type": "news",
231
+ "intent": "descriptive",
232
+ "abusive_language_present": False,
233
+ "content_flags": ["political_content", "public_figure", "protest"],
234
+ },
235
+ },
236
+
237
+ {
238
+ "task_id": "T-006",
239
+ "difficulty": "hard",
240
+ "ambiguity_level": "high",
241
+ "risk_level": "high",
242
+ "post_text": (
243
+ "Sending Eid Mubarak wishes to my community 🌙✨ "
244
+ "During this holy month let us remember those suffering in conflict zones. "
245
+ "Our brothers and sisters deserve peace and dignity. "
246
+ "May Allah guide us all. #EidMubarak #Ramadan #Peace #Solidarity"
247
+ ),
248
+ "image_description": (
249
+ "Warm celebratory post with a crescent moon and stars graphic. "
250
+ "Soft gold and green colors. "
251
+ "Comment section shows replies in Arabic, Urdu, and English. "
252
+ "Emojis: praying hands, crescent moon, dove."
253
+ ),
254
+ "tool_responses": {
255
+ "read_comments": (
256
+ "89 comments: 'Eid Mubarak!', 'JazakAllah Khair', "
257
+ "'prayers for peace', 'beautiful message', "
258
+ "1 comment flagged by automated system: 'why is this allowed, "
259
+ "these people support terrorism' — that comment is from a third party, "
260
+ "not the original poster. "
261
+ "No incitement, no threats, no extremist content in original post."
262
+ ),
263
+ "check_user_history": (
264
+ "Account 4 years active. 200+ posts — community announcements, "
265
+ "religious holiday greetings, charity drives. "
266
+ "Zero prior violations. Community leader in local Muslim cultural centre."
267
+ ),
268
+ "check_entity_status": (
269
+ "Islam / Muslim community: CONFIRMED PROTECTED CLASS (religion). "
270
+ "Eid al-Fitr: globally recognized Islamic religious holiday. "
271
+ "Content: religious greeting + humanitarian solidarity message. "
272
+ "No extremist symbols, no prohibited organisations mentioned."
273
+ ),
274
+ "view_policy": (
275
+ "Policy 8.1 — Religious Expression: Greetings and solidarity messages "
276
+ "for recognized religious holidays are PROTECTED. "
277
+ "Vague 'conflict zone' solidarity without naming a prohibited group "
278
+ "is NOT a violation. "
279
+ "Automated flag was triggered by a COMMENT, not the original post. "
280
+ "Action: ALLOW — original content is clearly safe religious expression."
281
+ ),
282
+ },
283
+ "required_tools": ["read_comments", "check_entity_status", "view_policy"],
284
+ "ground_truth": "ALLOW",
285
+ "ground_truth_signals": {
286
+ "target": "religion",
287
+ "is_protected_class": True,
288
+ "toxicity_level": 0.05,
289
+ "is_direct_attack": False,
290
+ "context_type": "statement",
291
+ "intent": "educational",
292
+ "abusive_language_present": False,
293
+ "content_flags": ["political_content"],
294
+ },
295
+ },
296
+ ]
server/your_environment.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ import uuid
5
+ from typing import Optional, Dict, Any, Set
6
+
7
+ try:
8
+ from openenv_core.env_server import Environment
9
+ print("[env] Inheriting from openenv_core.env_server.Environment ✅")
10
+ except ImportError:
11
+ try:
12
+ from openenv.core.env_server import Environment
13
+ print("[env] Inheriting from openenv.core.env_server.Environment ✅")
14
+ except ImportError:
15
+ Environment = object
16
+ print("[env] openenv_core not found — using plain object base ⚠️")
17
+
18
+ from models import TrustObservation, TrustAction, TrustState, ContentSignals
19
+ from tasks import TASKS
20
+
21
+
22
# Per-tool cost charged when the agent invokes an investigation tool.
TOOL_COSTS: Dict[str, float] = {
    "read_comments": 0.05,
    "check_user_history": 0.05,
    "check_entity_status": 0.10,
    "view_policy": 0.10,
}

# Hard episode cap: step() at or beyond this count terminates with reward 0.
MAX_STEPS = 7

# Decision-pair -> base accuracy reward.
# NOTE(review): assumed key order is (agent_decision, ground_truth) — the
# lookup code is not in this chunk; confirm. Pairs with ground_truth
# "ESCALATE" other than the exact match are absent; no bundled task uses
# ESCALATE as ground truth, but verify before adding one.
DECISION_MATRIX: Dict[tuple, float] = {
    ("REMOVE", "REMOVE"): 1.00,
    ("ALLOW", "ALLOW"): 1.00,
    ("ALLOW_WITH_WARNING", "ALLOW_WITH_WARNING"): 1.00,
    ("ESCALATE", "ESCALATE"): 1.00,
    ("ALLOW_WITH_WARNING", "ALLOW"): 0.75,
    ("ALLOW", "ALLOW_WITH_WARNING"): 0.55,
    ("ESCALATE", "ALLOW_WITH_WARNING"): 0.65,
    ("ESCALATE", "ALLOW"): 0.45,
    ("ESCALATE", "REMOVE"): 0.45,
    ("REMOVE", "ALLOW"): 0.10,
    ("REMOVE", "ALLOW_WITH_WARNING"): 0.20,
    ("ALLOW", "REMOVE"): 0.00,
    ("ALLOW_WITH_WARNING", "REMOVE"): 0.15,
}
46
+
47
+
48
+ class TrustSafetyEnvironment(Environment):
49
+ """
50
+ 3-Layer Risk-Aware Trust & Safety RL Environment.
51
+
52
+ Layer 1 — Evidence gathering : agent uses investigation tools (optional)
53
+ Layer 2 — Signal extraction : agent outputs ContentSignals as feature extractor
54
+ Layer 3 — Policy engine : validates signals, applies rules, computes reward
55
+
56
+ 8-Component Reward: Accuracy · Policy Alignment · Signal Quality · Escalation
57
+ Tool Usage · Consistency · Risk Sensitivity · Confidence
58
+ """
59
+
60
+ def __init__(self, seed: int = 42) -> None:
61
+ super().__init__()
62
+ self._rng = random.Random(seed)
63
+ self._current_task: Optional[Dict[str, Any]] = None
64
+ self._tools_used: Set[str] = set()
65
+ self._step_count: int = 0
66
+ self._extracted_signals: Optional[ContentSignals] = None
67
+ self._validation_result: Optional[Dict[str, Any]] = None
68
+ self._signals_extracted: bool = False
69
+ self._obs: Optional[TrustObservation]= None
70
+ self._state = TrustState()
71
+
72
+ # ✅ FIX 3 — build a dict keyed by task_id for O(1) lookup
73
+ self._tasks: Dict[str, Dict[str, Any]] = {
74
+ t["task_id"]: t for t in TASKS
75
+ }
76
+
77
+ # -----------------------------------------------------------------------
78
+ # OpenEnv interface
79
+ # -----------------------------------------------------------------------
80
+
81
+ def reset(self, seed=None, episode_id=None, **kwargs) -> TrustObservation:
82
+ # ✅ FIX 1 — reset() is now correctly INSIDE the class
83
+ if seed is not None:
84
+ self._rng.seed(seed)
85
+
86
+ # Pick task by episode_id if provided, else random from all 6
87
+ if episode_id and episode_id in self._tasks:
88
+ task = self._tasks[episode_id]
89
+ else:
90
+ task = self._rng.choice(list(self._tasks.values()))
91
+
92
+ self._current_task = task
93
+ self._tools_used = set()
94
+ self._step_count = 0
95
+ self._extracted_signals = None
96
+ self._validation_result = None
97
+ self._signals_extracted = False
98
+
99
+ self._state = TrustState(
100
+ episode_id=task["task_id"],
101
+ step_count=0,
102
+ current_task_id=task["task_id"],
103
+ difficulty=task.get("difficulty", "medium"),
104
+ risk_level=task.get("risk_level", "medium"),
105
+ is_done=False,
106
+ tools_used=[],
107
+ signals_extracted=False,
108
+ )
109
+
110
+ self._obs = TrustObservation(
111
+ ticket_id=task["task_id"],
112
+ post_text=task["post_text"],
113
+ image_description=task.get("image_description", ""),
114
+ step_number=0,
115
+ done=False,
116
+ )
117
+ return self._obs # ✅ FIX 2 — single clean return, stray return removed
118
+
119
+ def step(self, action: TrustAction, timeouts: Optional[Any] = None,
120
+ **kwargs) -> TrustObservation:
121
+ if self._current_task is None or self._obs is None:
122
+ raise RuntimeError("Call reset() before step().")
123
+
124
+ if self._step_count >= MAX_STEPS:
125
+ self._obs = TrustObservation(
126
+ ticket_id=self._current_task["task_id"],
127
+ post_text=self._obs.post_text,
128
+ image_description=self._obs.image_description,
129
+ step_number=self._step_count,
130
+ done=True,
131
+ reward=0.0,
132
+ info={"reason": "timeout", "tools_used": list(self._tools_used)},
133
+ )
134
+ return self._obs
135
+
136
+ atype = action.action_type
137
+ if atype == "use_tool":
138
+ return self._handle_tool(action)
139
+ if atype == "extract_signals":
140
+ return self._handle_signal_extraction(action)
141
+ if atype == "final_decision":
142
+ return self._handle_final_decision(action)
143
+ raise ValueError(f"Unknown action_type: {atype!r}")
144
+
145
    @property
    def state(self) -> TrustState:
        """Current mutable episode state (served by GET /state)."""
        return self._state
148
+
149
+ # -----------------------------------------------------------------------
150
+ # Layer 1 — Tool handling
151
+ # -----------------------------------------------------------------------
152
+
153
+ def _handle_tool(self, action: TrustAction) -> TrustObservation:
154
+ tool = action.tool_name
155
+ if tool not in TOOL_COSTS:
156
+ raise ValueError(f"Unknown tool: {tool!r}")
157
+ self._tools_used.add(tool)
158
+ response = self._current_task["tool_responses"].get(tool, "No data found.")
159
+ field_map = {
160
+ "read_comments": "comments_found",
161
+ "check_user_history": "user_history_found",
162
+ "check_entity_status": "entity_status_found",
163
+ "view_policy": "policy_found",
164
+ }
165
+ self._step_count += 1
166
+ self._state.step_count = self._step_count
167
+ self._state.tools_used = list(self._tools_used)
168
+
169
+ obs_kwargs = {
170
+ k: getattr(self._obs, k)
171
+ for k in ("ticket_id", "post_text", "image_description",
172
+ "comments_found", "user_history_found",
173
+ "entity_status_found", "policy_found",
174
+ "extracted_signals", "validation_result")
175
+ }
176
+ obs_kwargs[field_map[tool]] = response
177
+ obs_kwargs["step_number"] = self._step_count
178
+ obs_kwargs["done"] = False
179
+ obs_kwargs["reward"] = None
180
+
181
+ self._obs = TrustObservation(**obs_kwargs)
182
+ return self._obs
183
+
184
+ # -----------------------------------------------------------------------
185
+ # Layer 2 — Signal extraction + validation
186
+ # -----------------------------------------------------------------------
187
+
188
+ def _handle_signal_extraction(self, action: TrustAction) -> TrustObservation:
189
+ raw = action.signals
190
+ raw.toxicity_level = max(0.0, min(1.0, float(raw.toxicity_level)))
191
+ raw.confidence = max(0.0, min(1.0, float(raw.confidence)))
192
+ if not isinstance(raw.content_flags, list):
193
+ raw.content_flags = []
194
+
195
+ self._extracted_signals = raw
196
+ self._signals_extracted = True
197
+ self._validation_result = self._validate_signals(raw)
198
+ self._step_count += 1
199
+ self._state.step_count = self._step_count
200
+ self._state.signals_extracted = True
201
+
202
+ obs_kwargs = {
203
+ k: getattr(self._obs, k)
204
+ for k in ("ticket_id", "post_text", "image_description",
205
+ "comments_found", "user_history_found",
206
+ "entity_status_found", "policy_found")
207
+ }
208
+ obs_kwargs["extracted_signals"] = {
209
+ "target": raw.target,
210
+ "is_protected_class": raw.is_protected_class,
211
+ "toxicity_level": raw.toxicity_level,
212
+ "is_direct_attack": raw.is_direct_attack,
213
+ "context_type": raw.context_type,
214
+ "intent": raw.intent,
215
+ "confidence": raw.confidence,
216
+ "abusive_language_present": raw.abusive_language_present,
217
+ "content_flags": raw.content_flags,
218
+ }
219
+ obs_kwargs["validation_result"] = self._validation_result
220
+ obs_kwargs["step_number"] = self._step_count
221
+ obs_kwargs["done"] = False
222
+ obs_kwargs["reward"] = None
223
+
224
+ self._obs = TrustObservation(**obs_kwargs)
225
+ return self._obs
226
+
227
+ def _validate_signals(self, s: ContentSignals) -> Dict[str, Any]:
228
+ issues = []
229
+ conf = s.confidence
230
+
231
+ if not s.abusive_language_present and s.toxicity_level > 0.75:
232
+ issues.append("high_toxicity_without_abusive_language"); conf -= 0.15
233
+ if s.context_type in ("satire", "education") and s.intent == "harmful":
234
+ issues.append("harmful_intent_contradicts_satire_context"); conf -= 0.12
235
+ if s.is_protected_class and s.target == "none":
236
+ issues.append("protected_class_flagged_without_target"); conf -= 0.10
237
+ if s.is_direct_attack and s.toxicity_level < 0.25:
238
+ issues.append("direct_attack_with_low_toxicity"); conf -= 0.12
239
+ if s.context_type == "spam" and "spam" not in s.content_flags \
240
+ and "phishing" not in s.content_flags:
241
+ issues.append("spam_context_missing_spam_flag"); conf -= 0.08
242
+
243
+ adj = round(max(0.0, min(1.0, conf)), 3)
244
+ return {
245
+ "original_confidence": s.confidence,
246
+ "adjusted_confidence": adj,
247
+ "consistency_issues": issues,
248
+ "issue_count": len(issues),
249
+ "is_consistent": len(issues) == 0,
250
+ "needs_escalation_hint": adj < 0.55 or len(issues) >= 2,
251
+ }
252
+
253
+ # -----------------------------------------------------------------------
254
+ # Layer 3 — Policy engine + final decision
255
+ # -----------------------------------------------------------------------
256
+
257
+ def _policy_engine(self) -> Dict[str, Any]:
258
+ if not self._extracted_signals or not self._validation_result:
259
+ return {"recommended": "ESCALATE", "reason": "no_signals_extracted",
260
+ "rule_strength": "weak"}
261
+
262
+ s = self._extracted_signals
263
+ adj = self._validation_result["adjusted_confidence"]
264
+
265
+ if adj < 0.50:
266
+ return {"recommended": "ESCALATE", "reason": "low_adjusted_confidence",
267
+ "rule_strength": "medium"}
268
+ if "spam" in s.content_flags or "phishing" in s.content_flags:
269
+ return {"recommended": "REMOVE", "reason": "spam_or_phishing_detected",
270
+ "rule_strength": "strong"}
271
+ if s.is_protected_class and s.is_direct_attack \
272
+ and s.toxicity_level > 0.70 and adj > 0.70:
273
+ return {"recommended": "REMOVE",
274
+ "reason": "protected_class_direct_attack_high_toxicity",
275
+ "rule_strength": "strong"}
276
+ if s.toxicity_level > 0.85 and s.intent == "harmful":
277
+ return {"recommended": "REMOVE", "reason": "high_toxicity_harmful_intent",
278
+ "rule_strength": "strong"}
279
+ if s.context_type in ("satire", "news", "education", "protest") \
280
+ and s.intent in ("satirical", "descriptive", "educational"):
281
+ return {"recommended": "ALLOW_WITH_WARNING",
282
+ "reason": "protected_expression_context",
283
+ "rule_strength": "medium"}
284
+ if 0.40 < s.toxicity_level < 0.75 and s.intent == "ambiguous":
285
+ return {"recommended": "ESCALATE", "reason": "ambiguous_moderate_toxicity",
286
+ "rule_strength": "medium"}
287
+ return {"recommended": "ALLOW", "reason": "no_policy_violation_detected",
288
+ "rule_strength": "medium"}
289
+
290
+ def _handle_final_decision(self, action: TrustAction) -> TrustObservation:
291
+ decision = action.final_decision
292
+ components = self._compute_components(decision)
293
+ policy_rec = components.pop("_policy_rec")
294
+ reward = self._finalize_reward(components)
295
+
296
+ self._step_count += 1
297
+ self._state.step_count = self._step_count
298
+ self._state.is_done = True
299
+ components["final_reward"] = reward
300
+
301
+ obs_kwargs = {
302
+ k: getattr(self._obs, k)
303
+ for k in ("ticket_id", "post_text", "image_description",
304
+ "comments_found", "user_history_found",
305
+ "entity_status_found", "policy_found",
306
+ "extracted_signals", "validation_result")
307
+ }
308
+ obs_kwargs["step_number"] = self._step_count
309
+ obs_kwargs["done"] = True
310
+ obs_kwargs["reward"] = reward
311
+ obs_kwargs["info"] = {
312
+ "final_decision": decision,
313
+ "ground_truth": self._current_task["ground_truth"],
314
+ "policy_recommendation": policy_rec,
315
+ "signals_extracted": self._signals_extracted,
316
+ "tools_used": list(self._tools_used),
317
+ "required_tools": self._current_task["required_tools"],
318
+ "ambiguity_level": self._current_task["ambiguity_level"],
319
+ "risk_level": self._current_task["risk_level"],
320
+ "task_id": self._current_task["task_id"],
321
+ "reward_breakdown": components,
322
+ }
323
+
324
+ self._obs = TrustObservation(**obs_kwargs)
325
+ return self._obs
326
+
327
+ # -----------------------------------------------------------------------
328
+ # 8-Component Reward Engine
329
+ # -----------------------------------------------------------------------
330
+
331
    def _compute_components(self, final_decision: str) -> Dict[str, Any]:
        """Compute every reward component for the given final decision.

        The components are summed (bonuses added, penalties subtracted) by
        ``_finalize_reward``.  The returned dict also smuggles the policy
        engine's recommendation under the private key ``"_policy_rec"`` so
        the caller can surface it in ``info`` without recomputing it.
        """
        gt = self._current_task["ground_truth"]
        required_tools = self._current_task["required_tools"]
        ambiguity = self._current_task["ambiguity_level"]
        risk_level = self._current_task["risk_level"]
        policy_rec = self._policy_engine()

        # 1. Base score from the (decision, ground_truth) payoff matrix;
        #    unknown pairs default to 0.20.  Escalating a high-ambiguity task
        #    is never scored below 0.70.
        base_score = DECISION_MATRIX.get((final_decision, gt), 0.20)
        if final_decision == "ESCALATE" and ambiguity == "high":
            base_score = max(base_score, 0.70)
        is_correct = base_score >= 0.90

        # 2. Alignment with the policy engine: bonus for agreeing, larger
        #    penalty for disagreeing, both scaled by the rule's strength.
        rule_weight = {"strong": 1.0, "medium": 0.70, "weak": 0.40}.get(
            policy_rec.get("rule_strength", "medium"), 0.70)
        policy_alignment = round(
            (+0.12 if final_decision == policy_rec["recommended"] else -0.18) * rule_weight, 4)

        # 3. Bonus for matching the task's ground-truth signals (<= 0.15).
        signal_accuracy_bonus = self._compute_signal_accuracy()

        # 4. Escalation calibration: reward escalating when adjusted
        #    confidence is low (< 0.50), penalize failing to escalate then,
        #    and penalize unnecessary escalation (more on low-ambiguity tasks).
        adj_conf = (self._validation_result["adjusted_confidence"]
                    if self._validation_result else 0.50)
        should_escalate = adj_conf < 0.50
        if should_escalate and final_decision == "ESCALATE":
            escalation_adj = +0.15
        elif should_escalate and final_decision != "ESCALATE":
            escalation_adj = -0.18
        elif not should_escalate and final_decision == "ESCALATE" and ambiguity == "low":
            escalation_adj = -0.20
        elif not should_escalate and final_decision == "ESCALATE":
            escalation_adj = -0.10
        else:
            escalation_adj = 0.0

        # 5. Flat bonus for having run Layer-2 extraction at all.
        signal_bonus = +0.05 if self._signals_extracted else -0.10
        # 6. Tool economics: pay per tool used, and 0.25 per required tool
        #    the agent skipped.
        tool_cost = round(sum(TOOL_COSTS.get(t, 0.0) for t in self._tools_used), 4)
        missing_required = set(required_tools) - self._tools_used
        tool_miss_penalty = round(len(missing_required) * 0.25, 4)

        # 7. Penalty scaled by how many consistency issues validation found;
        #    skipping extraction entirely costs a mid-level 0.12.
        if self._validation_result:
            n = self._validation_result["issue_count"]
            validation_penalty = {0: 0.00, 1: 0.05, 2: 0.12}.get(n, 0.20)
        else:
            validation_penalty = 0.12

        # 8. Wrong answers hurt more on higher-risk tasks.
        risk_penalty = 0.0
        if not is_correct:
            risk_penalty = {"high": 0.20, "medium": 0.10, "low": 0.0}.get(risk_level, 0.0)

        # 9. Confidence calibration: being confidently wrong is penalized,
        #    while escalating under genuinely low confidence earns a rebate
        #    (a negative penalty).
        if base_score < 0.50 and adj_conf > 0.80:
            confidence_penalty = 0.22
        elif base_score < 0.50 and adj_conf > 0.65:
            confidence_penalty = 0.12
        elif self._signals_extracted and final_decision == "ESCALATE" and adj_conf < 0.55:
            confidence_penalty = -0.10
        else:
            confidence_penalty = 0.0

        return {
            "base_score": base_score,
            "policy_alignment": policy_alignment,
            "signal_accuracy_bonus": signal_accuracy_bonus,
            "escalation_adj": escalation_adj,
            "signal_bonus": signal_bonus,
            "tool_cost": tool_cost,
            "tool_miss_penalty": tool_miss_penalty,
            "validation_penalty": validation_penalty,
            "risk_penalty": risk_penalty,
            "confidence_penalty": confidence_penalty,
            "_policy_rec": policy_rec,
        }
401
+
402
+ def _finalize_reward(self, components: Dict[str, Any]) -> float:
403
+ raw = (
404
+ components["base_score"]
405
+ + components["policy_alignment"]
406
+ + components["signal_accuracy_bonus"]
407
+ + components["escalation_adj"]
408
+ + components["signal_bonus"]
409
+ - components["tool_cost"]
410
+ - components["tool_miss_penalty"]
411
+ - components["validation_penalty"]
412
+ - components["risk_penalty"]
413
+ - components["confidence_penalty"]
414
+ )
415
+ return round(max(0.0, min(1.0, raw)), 4)
416
+
417
+ def _compute_signal_accuracy(self) -> float:
418
+ if not self._extracted_signals:
419
+ return 0.0
420
+ gt = self._current_task.get("ground_truth_signals", {})
421
+ if not gt:
422
+ return 0.05
423
+
424
+ s = self._extracted_signals
425
+ score = 0.0
426
+ if s.target == gt.get("target"): score += 0.20
427
+ if s.intent == gt.get("intent"): score += 0.20
428
+ if s.context_type == gt.get("context_type"): score += 0.20
429
+
430
+ tox_diff = abs(s.toxicity_level - gt.get("toxicity_level", 0.5))
431
+ score += 0.20 if tox_diff <= 0.20 else (0.10 if tox_diff <= 0.35 else 0.0)
432
+
433
+ gt_flags = set(gt.get("content_flags", []))
434
+ s_flags = set(s.content_flags)
435
+ if gt_flags:
436
+ score += 0.20 * min(1.0, len(gt_flags & s_flags) / len(gt_flags))
437
+ else:
438
+ score += 0.20 if not s_flags else 0.10
439
+
440
+ return round(score * 0.15, 4)
uv.lock ADDED
The diff for this file is too large to render. See raw diff