""" environment.py - OpenEnv wrapper around EnterpriseOpsEnv. Imports the existing env/env.py class and exposes it via the OpenEnv Environment interface (reset, step, state). Anti-reward-hacking protections -------------------------------- 1. Timeout: step > 30 s -> done=True, reward = -10 2. Loop detection: same action 3x consecutively -> reward -= 5 3. Max-steps cap: episode ends at step 8 regardless 4. State lock: WorldModel is read-only outside of step() 5. Audit log: every step logged to episodes.db """ from __future__ import annotations import hashlib import json import os import sqlite3 import sys import time from collections import deque from pathlib import Path from typing import Any, Callable, Optional # -- Ensure project root is importable (Handles both Local and HF layouts) --- _ROOT = Path(__file__).resolve().parent if _ROOT.name == "server": _ROOT = _ROOT.parent if str(_ROOT) not in sys.path: sys.path.insert(0, str(_ROOT)) # -- OpenEnv base ----------------------------------------------------------- try: from openenv.core import Environment except ImportError: class Environment: """Stub when openenv-core is not installed.""" def _reset_rubric(self) -> None: pass # -- Existing enterprise_ops imports (flat, relative to _ROOT) -------------- from contracts import ( ActionSchema, ObservationSchema, RewardComponents, StepResult as InnerStepResult, AGENT_IT, AGENT_MANAGER, AGENT_FINANCE, AGENT_OVERSIGHT, ) from env.env import EnterpriseOpsEnv from agents import ITAgent, ManagerAgent, FinanceAgent from agents.trained_agent import TrainedITAgent from models import EnterpriseAction, EnterpriseObservation # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- ALL_AGENTS = [AGENT_IT, AGENT_MANAGER, AGENT_FINANCE, AGENT_OVERSIGHT] STEP_TIMEOUT_S = 30.0 LOOP_WINDOW = 3 LOOP_PENALTY = -5.0 MAX_STEPS_HARD_CAP = int(os.environ.get("MAX_STEPS", "8")) TIMEOUT_PENALTY = -10.0 # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _action_fingerprint(action: EnterpriseAction) -> str: """Deterministic hash of an action for loop detection.""" raw = ( f"{action.agent_id}|{action.tool_call}|" f"{json.dumps(action.tool_params, sort_keys=True, default=str)}|" f"{action.message_to}" ) return hashlib.md5(raw.encode()).hexdigest() def _obs_to_openenv( obs: ObservationSchema, reward_breakdown: dict[str, float], reward: float, done: bool, ) -> EnterpriseObservation: """Convert a Pydantic ObservationSchema -> dataclass EnterpriseObservation.""" return EnterpriseObservation( agent_id=obs.agent_id, inbox=[m.model_dump() for m in obs.inbox], tickets=[t.model_dump() for t in obs.tickets], resource_pool=obs.resource_pool.model_dump() if obs.resource_pool else None, active_deals=[d.model_dump() for d in obs.active_deals], project_tasks=[t.model_dump() for t in obs.project_tasks], step_number=obs.step_number, schema_version=obs.schema_version, reward_breakdown=reward_breakdown, reward=reward, done=done, ) # --------------------------------------------------------------------------- # Default reward function (Ayush overrides via env.reward_fn = custom_fn) # --------------------------------------------------------------------------- def default_reward_fn( step_result: InnerStepResult, actions: dict[str, ActionSchema], ) -> float: """Sum of all agents' RewardComponents.total().""" return 


# ---------------------------------------------------------------------------
# Audit logger
# ---------------------------------------------------------------------------
def _ensure_audit_table(db_path: str) -> None:
    """Create the audit table if it does not exist."""
    with sqlite3.connect(db_path) as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS step_audit (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                timestamp REAL NOT NULL,
                event TEXT NOT NULL,
                step INTEGER,
                agent_id TEXT,
                action_json TEXT,
                reward REAL,
                done INTEGER,
                info_json TEXT
            )
        """)
        conn.commit()


def _log_audit(
    db_path: str,
    event: str,
    step: int = 0,
    agent_id: str = "",
    action_json: str = "{}",
    reward: float = 0.0,
    done: bool = False,
    info_json: str = "{}",
) -> None:
    """Write one row to the step_audit table."""
    try:
        with sqlite3.connect(db_path) as conn:
            conn.execute(
                "INSERT INTO step_audit "
                "(timestamp, event, step, agent_id, action_json, reward, done, info_json) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                (time.time(), event, step, agent_id, action_json,
                 reward, int(done), info_json),
            )
            conn.commit()
    except Exception:
        pass  # audit must never crash the env
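

# A minimal sketch of reading the audit trail back out, assuming only the
# ``step_audit`` schema created above. The helper name is illustrative and
# the wrapper never calls it; it is here to show how the log can be queried.
def example_recent_audit_rows(
    db_path: str = "episodes.db", limit: int = 10
) -> list[tuple]:
    """Return the most recent audit rows, newest first."""
    with sqlite3.connect(db_path) as conn:
        return conn.execute(
            "SELECT timestamp, event, step, agent_id, reward, done "
            "FROM step_audit ORDER BY id DESC LIMIT ?",
            (limit,),
        ).fetchall()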


# ===========================================================================
# EnterpriseEnvironment - OpenEnv wrapper
# ===========================================================================
class EnterpriseEnvironment(Environment):
    """
    OpenEnv-compatible wrapper around the existing EnterpriseOpsEnv.

    Usage::

        env = EnterpriseEnvironment()
        obs = env.reset()
        result = env.step(EnterpriseAction(agent_id="it_agent", tool_call="get_tickets"))

    For multi-agent training, call ``get_all_observations()`` after each
    ``step()`` to retrieve every agent's partial view.

    Anti-reward-hacking:
      - 30 s timeout per step
      - 3x identical-action loop -> -5 reward
      - Hard cap at 8 steps
      - All steps audited in episodes.db
    """

    def __init__(
        self,
        scenario_path: Optional[str] = None,
        seed: int = 42,
        max_steps: int = MAX_STEPS_HARD_CAP,
        noise_rate: float = 0.08,
        db_path: str = "episodes.db",
    ) -> None:
        super().__init__()
        if scenario_path is None:
            scenario_path = str(_ROOT / "env" / "scenarios" / "scenario_01.yaml")

        self.inner_env = EnterpriseOpsEnv(
            scenario_path=scenario_path,
            seed=seed,
            max_steps=max_steps,
            noise_rate=noise_rate,
            db_path=db_path,
        )

        # -- Pluggable reward - INTERFACE STABLE (Ayush overrides this) -----
        self.reward_fn: Callable[..., float] = default_reward_fn

        # -- IT: rule-based by default; optional HuggingFace LoRA via set_use_trained_it()
        self._rule_it_agent: Any = ITAgent()
        self._trained_it_agent: Any | None = None
        self._use_trained_it: bool = False

        # -- Rule-based fallback agents for untrained roles ------------------
        self._fallback_agents: dict[str, Any] = {
            AGENT_IT: self._rule_it_agent,
            AGENT_MANAGER: ManagerAgent(AGENT_MANAGER),
            AGENT_FINANCE: FinanceAgent(AGENT_FINANCE),
        }

        # -- Anti-hacking state ----------------------------------------------
        self._action_history: deque[str] = deque(maxlen=20)
        self._step_count: int = 0
        self._max_steps: int = max_steps
        self._done: bool = False
        self._db_path: str = db_path

        # -- Observation cache -------------------------------------------------
        self._current_obs: dict[str, ObservationSchema] = {}
        self._last_step_result: Optional[InnerStepResult] = None

        _ensure_audit_table(db_path)

        print(
            f"[EnterpriseEnvironment] Wrapper ready | "
            f"scenario={Path(scenario_path).stem} | "
            f"max_steps={max_steps} | db={db_path}"
        )

    def set_use_trained_it(self, use: bool) -> None:
        """When True, IT fallback uses TrainedITAgent (LoRA); when False, rule-based ITAgent."""
        self._use_trained_it = use
        if use:
            if self._trained_it_agent is None:
                self._trained_it_agent = TrainedITAgent()
            self._fallback_agents[AGENT_IT] = self._trained_it_agent
        else:
            self._fallback_agents[AGENT_IT] = self._rule_it_agent

    def it_agent_status(self) -> str:
        """Short status string for the Gradio UI."""
        if not self._use_trained_it:
            return "Rule-based agents active"
        if self._trained_it_agent is not None and self._trained_it_agent.model is not None:
            return "Trained LoRA model active (HuggingFace)"
        return "Trained mode on — LoRA not loaded, using rule-based fallback"

    # ------------------------------------------------------------------
    # reset
    # ------------------------------------------------------------------
    def reset(
        self,
        scenario: Optional[str] = None,
        seed: Optional[int] = None,
        use_trained_model: bool = False,
    ) -> EnterpriseObservation:
        """Reset the environment. Returns the manager-agent view as the primary obs."""
        self.set_use_trained_it(use_trained_model)

        kwargs: dict[str, Any] = {}
        if scenario:
            if not scenario.endswith(".yaml"):
                scenario = str(_ROOT / "env" / "scenarios" / f"{scenario}.yaml")
            kwargs["scenario"] = scenario
        if seed is not None:
            kwargs["seed"] = seed

        self._current_obs = self.inner_env.reset(**kwargs)
        self._action_history.clear()
        self._step_count = 0
        self._done = False
        self._last_step_result = None

        _log_audit(self._db_path, "reset", info_json=json.dumps(kwargs, default=str))

        return _obs_to_openenv(
            self._current_obs[AGENT_MANAGER],
            reward_breakdown={},
            reward=0.0,
            done=False,
        )

    # ------------------------------------------------------------------
    # step (single-agent primary interface)
    # ------------------------------------------------------------------
    def step(
        self,
        action: EnterpriseAction,
        use_trained_model: Optional[bool] = None,
    ) -> dict[str, Any]:
        """
        Execute one environment step.

        The provided ``action`` controls one agent; all other agents use
        their rule-based fallback policies.

        Returns
        -------
        dict with keys: observation, reward, done, info
        """
        if use_trained_model is not None:
            self.set_use_trained_it(use_trained_model)

        # -- 1. Hard cap ----------------------------------------------------
        if self._done or self._step_count >= self._max_steps:
            self._done = True
            fallback_obs = next(iter(self._current_obs.values()))
            obs = _obs_to_openenv(fallback_obs, {}, 0.0, True)
            return {"observation": obs, "reward": 0.0, "done": True,
                    "info": {"reason": "max_steps_exceeded",
                             "it_agent_status": self.it_agent_status()}}

        start_time = time.time()
        reward_penalty = 0.0

        # -- 2. Loop detection ----------------------------------------------
        fp = _action_fingerprint(action)
        self._action_history.append(fp)
        if len(self._action_history) >= LOOP_WINDOW:
            tail = list(self._action_history)[-LOOP_WINDOW:]
            if len(set(tail)) == 1:
                reward_penalty += LOOP_PENALTY

        # -- 3. Build full action dict ----------------------------------------
        primary = ActionSchema(
            tool_call=action.tool_call,
            tool_params=action.tool_params or {},
            message_to=action.message_to,
            message_content=action.message_content,
            reasoning=action.reasoning,
        )
        all_actions: dict[str, ActionSchema] = {}
        for aid in ALL_AGENTS:
            if aid == action.agent_id:
                all_actions[aid] = primary
            elif aid == AGENT_OVERSIGHT:
                all_actions[aid] = ActionSchema()
            elif aid in self._fallback_agents and aid in self._current_obs:
                all_actions[aid] = self._fallback_agents[aid].act(self._current_obs[aid])
            else:
                all_actions[aid] = ActionSchema()

        # -- 4. Execute inner step --------------------------------------------
        inner_result: InnerStepResult = self.inner_env.step(all_actions)
        elapsed = time.time() - start_time

        # -- 5. Timeout check -------------------------------------------------
        if elapsed > STEP_TIMEOUT_S:
            self._done = True
            obs = _obs_to_openenv(
                inner_result.observations.get(
                    AGENT_MANAGER,
                    next(iter(inner_result.observations.values())),
                ),
                {}, TIMEOUT_PENALTY, True,
            )
            _log_audit(self._db_path, "timeout", self._step_count, action.agent_id,
                       json.dumps(action.to_dict(), default=str),
                       TIMEOUT_PENALTY, True, json.dumps({"elapsed": elapsed}))
            return {"observation": obs, "reward": TIMEOUT_PENALTY, "done": True,
                    "info": {"reason": "timeout", "elapsed_s": elapsed,
                             "it_agent_status": self.it_agent_status()}}

        # -- 6. Compute reward --------------------------------------------------
        base_reward = self.reward_fn(inner_result, all_actions)
        total_reward = base_reward + reward_penalty

        # -- 7. Build reward breakdown --------------------------------------------
        breakdown: dict[str, float] = {}
        for aid, rc in inner_result.rewards.items():
            breakdown[f"{aid}_total"] = rc.total()
            breakdown[f"{aid}_task"] = rc.task_completion
            breakdown[f"{aid}_sla"] = rc.sla_adherence
            breakdown[f"{aid}_coord"] = rc.coordination_bonus
            breakdown[f"{aid}_drift"] = rc.schema_adaptation
            breakdown[f"{aid}_breach"] = rc.sla_breach_penalty
            breakdown[f"{aid}_halluc"] = rc.hallucination_penalty
            breakdown[f"{aid}_oversight"] = rc.oversight_detection
        if reward_penalty != 0.0:
            breakdown["loop_penalty"] = reward_penalty

        # -- 8. Update state ----------------------------------------------------
        self._current_obs = inner_result.observations
        self._last_step_result = inner_result
        self._step_count += 1
        self._done = inner_result.done or self._step_count >= self._max_steps

        # -- 9. Build primary observation -----------------------------------------
        primary_agent = (
            action.agent_id
            if action.agent_id in inner_result.observations
            else AGENT_MANAGER
        )
        obs = _obs_to_openenv(
            inner_result.observations[primary_agent],
            breakdown,
            total_reward,
            self._done,
        )

        # -- 10. Audit log --------------------------------------------------------
        _log_audit(
            self._db_path,
            "step",
            self._step_count,
            action.agent_id,
            json.dumps(action.to_dict(), default=str),
            total_reward,
            self._done,
            json.dumps({
                "elapsed_s": elapsed,
                "loop_penalty": reward_penalty,
                "oversight_flags": inner_result.info.get("oversight_flags", []),
                "schema_version": inner_result.info.get("schema_version", 1),
            }, default=str),
        )

        return {
            "observation": obs,
            "reward": total_reward,
            "done": self._done,
            "info": {
                "step": self._step_count,
                "elapsed_s": elapsed,
                "loop_penalty": reward_penalty,
                "oversight_flags": inner_result.info.get("oversight_flags", []),
                "schema_version": inner_result.info.get("schema_version", 1),
                "tool_results": inner_result.info.get("tool_results", {}),
                "it_agent_status": self.it_agent_status(),
            },
        }

    # ------------------------------------------------------------------
    # step_multi - all agents in one call (for multi-agent training)
    # ------------------------------------------------------------------
    def step_multi(
        self,
        actions: dict[str, EnterpriseAction],
    ) -> dict[str, Any]:
        """
        Execute one step with explicit actions for multiple agents.

        Agents not in ``actions`` use their fallback rule policy.

        Returns dict with: observations (per-agent), reward, done, info
        """
        if self._done or self._step_count >= self._max_steps:
            self._done = True
            return {"observations": {}, "reward": 0.0, "done": True,
                    "info": {"reason": "max_steps_exceeded"}}

        start_time = time.time()

        # Build full action dict
        all_actions: dict[str, ActionSchema] = {}
        for aid in ALL_AGENTS:
            if aid in actions:
                a = actions[aid]
                all_actions[aid] = ActionSchema(
                    tool_call=a.tool_call,
                    tool_params=a.tool_params or {},
                    message_to=a.message_to,
                    message_content=a.message_content,
                    reasoning=a.reasoning,
                )
            elif aid == AGENT_OVERSIGHT:
                all_actions[aid] = ActionSchema()
            elif aid in self._fallback_agents and aid in self._current_obs:
                all_actions[aid] = self._fallback_agents[aid].act(self._current_obs[aid])
            else:
                all_actions[aid] = ActionSchema()

        inner_result = self.inner_env.step(all_actions)
        elapsed = time.time() - start_time

        if elapsed > STEP_TIMEOUT_S:
            self._done = True
            return {"observations": {}, "reward": TIMEOUT_PENALTY, "done": True,
                    "info": {"reason": "timeout", "elapsed_s": elapsed}}

        base_reward = self.reward_fn(inner_result, all_actions)

        self._current_obs = inner_result.observations
        self._last_step_result = inner_result
        self._step_count += 1
        self._done = inner_result.done or self._step_count >= self._max_steps

        per_agent_obs = {
            aid: _obs_to_openenv(
                obs,
                inner_result.rewards.get(aid, RewardComponents()).model_dump(),
                inner_result.rewards.get(aid, RewardComponents()).total(),
                self._done,
            )
            for aid, obs in inner_result.observations.items()
        }

        return {
            "observations": per_agent_obs,
            "reward": base_reward,
            "done": self._done,
            "info": {
                "step": self._step_count,
                "elapsed_s": elapsed,
                "schema_version": inner_result.info.get("schema_version", 1),
            },
        }

    # ------------------------------------------------------------------
    # state property
    # ------------------------------------------------------------------
    @property
    def state(self) -> dict[str, Any]:
        """Full world state snapshot (read-only view)."""
        raw = self.inner_env.state
        raw["wrapper_step_count"] = self._step_count
        raw["wrapper_done"] = self._done
        return raw

    # ------------------------------------------------------------------
    # get_all_observations - multi-agent partial views
    # ------------------------------------------------------------------
    def get_all_observations(self) -> dict[str, EnterpriseObservation]:
        """Return the latest per-agent EnterpriseObservation dict."""
        result: dict[str, EnterpriseObservation] = {}
        for aid, obs in self._current_obs.items():
            reward_total = 0.0
            breakdown: dict[str, float] = {}
            if self._last_step_result and aid in self._last_step_result.rewards:
                rc = self._last_step_result.rewards[aid]
                reward_total = rc.total()
                breakdown = rc.model_dump()
            result[aid] = _obs_to_openenv(obs, breakdown, reward_total, self._done)
        return result

    # ------------------------------------------------------------------
    # set_scenario - curriculum switching
    # ------------------------------------------------------------------
    def set_scenario(self, scenario_name: str) -> None:
        """
        Switch scenario for the next reset().

        Args:
            scenario_name: e.g. "scenario_03" or full path to YAML.
        """
""" if not scenario_name.endswith(".yaml"): scenario_name = str(_ROOT / "env" / "scenarios" / f"{scenario_name}.yaml") self.inner_env._scenario_path = scenario_name print(f"[EnterpriseEnvironment] Scenario set -> {Path(scenario_name).stem}") # ------------------------------------------------------------------ # per_agent_rewards - convenience for training loops # ------------------------------------------------------------------ def per_agent_rewards(self) -> dict[str, float]: """Return per-agent scalar rewards from the last step.""" if not self._last_step_result: return {aid: 0.0 for aid in ALL_AGENTS} return { aid: rc.total() for aid, rc in self._last_step_result.rewards.items() }