"""
environment.py - OpenEnv wrapper around EnterpriseOpsEnv.
Imports the existing env/env.py class and exposes it via the OpenEnv
Environment interface (reset, step, state).
Anti-reward-hacking protections
-------------------------------
1. Timeout: a step taking longer than 30 s ends the episode (done=True, reward = -10).
2. Loop detection: the same action 3x consecutively incurs an extra -5 reward.
3. Max-steps cap: the episode ends after max_steps steps (default 8, MAX_STEPS env var).
4. State lock: the WorldModel is read-only outside of step().
5. Audit log: every step is written to episodes.db.
"""
from __future__ import annotations
import hashlib
import json
import os
import sqlite3
import sys
import time
from collections import deque
from pathlib import Path
from typing import Any, Callable, Optional
# -- Ensure project root is importable (handles both local and HF layouts) ---
_ROOT = Path(__file__).resolve().parent
if _ROOT.name == "server":
_ROOT = _ROOT.parent
if str(_ROOT) not in sys.path:
sys.path.insert(0, str(_ROOT))
# -- OpenEnv base -----------------------------------------------------------
try:
from openenv.core import Environment
except ImportError:
class Environment:
"""Stub when openenv-core is not installed."""
def _reset_rubric(self) -> None:
pass
# -- Existing enterprise_ops imports (flat, relative to _ROOT) --------------
from contracts import (
ActionSchema,
ObservationSchema,
RewardComponents,
StepResult as InnerStepResult,
AGENT_IT,
AGENT_MANAGER,
AGENT_FINANCE,
AGENT_OVERSIGHT,
)
from env.env import EnterpriseOpsEnv
from agents import ITAgent, ManagerAgent, FinanceAgent
from agents.trained_agent import TrainedITAgent
from models import EnterpriseAction, EnterpriseObservation
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
ALL_AGENTS = [AGENT_IT, AGENT_MANAGER, AGENT_FINANCE, AGENT_OVERSIGHT]
STEP_TIMEOUT_S = 30.0
LOOP_WINDOW = 3
LOOP_PENALTY = -5.0
MAX_STEPS_HARD_CAP = int(os.environ.get("MAX_STEPS", "8"))
TIMEOUT_PENALTY = -10.0
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _action_fingerprint(action: EnterpriseAction) -> str:
"""Deterministic hash of an action for loop detection."""
raw = (
f"{action.agent_id}|{action.tool_call}|"
f"{json.dumps(action.tool_params, sort_keys=True, default=str)}|"
f"{action.message_to}"
)
return hashlib.md5(raw.encode()).hexdigest()
def _obs_to_openenv(
obs: ObservationSchema,
reward_breakdown: dict[str, float],
reward: float,
done: bool,
) -> EnterpriseObservation:
"""Convert a Pydantic ObservationSchema -> dataclass EnterpriseObservation."""
return EnterpriseObservation(
agent_id=obs.agent_id,
inbox=[m.model_dump() for m in obs.inbox],
tickets=[t.model_dump() for t in obs.tickets],
resource_pool=obs.resource_pool.model_dump() if obs.resource_pool else None,
active_deals=[d.model_dump() for d in obs.active_deals],
project_tasks=[t.model_dump() for t in obs.project_tasks],
step_number=obs.step_number,
schema_version=obs.schema_version,
reward_breakdown=reward_breakdown,
reward=reward,
done=done,
)
# ---------------------------------------------------------------------------
# Default reward function (Ayush overrides via env.reward_fn = custom_fn)
# ---------------------------------------------------------------------------
def default_reward_fn(
step_result: InnerStepResult,
actions: dict[str, ActionSchema],
) -> float:
"""Sum of all agents' RewardComponents.total()."""
return sum(r.total() for r in step_result.rewards.values())
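# Example override (a sketch: the 2x SLA weighting below is illustrative, not
# part of the env's contract, and assumes ``sla_breach_penalty`` is already
# negative). Assign it with ``env.reward_fn = sla_weighted_reward``:
def sla_weighted_reward(
    step_result: InnerStepResult,
    actions: dict[str, ActionSchema],
) -> float:
    """Illustrative alternative: weight SLA adherence twice as heavily."""
    return sum(
        r.task_completion + 2.0 * r.sla_adherence + r.sla_breach_penalty
        for r in step_result.rewards.values()
    )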
# ---------------------------------------------------------------------------
# Audit logger
# ---------------------------------------------------------------------------
def _ensure_audit_table(db_path: str) -> None:
"""Create the audit table if it does not exist."""
with sqlite3.connect(db_path) as conn:
conn.execute("""
CREATE TABLE IF NOT EXISTS step_audit (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp REAL NOT NULL,
event TEXT NOT NULL,
step INTEGER,
agent_id TEXT,
action_json TEXT,
reward REAL,
done INTEGER,
info_json TEXT
)
""")
conn.commit()
def _log_audit(
db_path: str,
event: str,
step: int = 0,
agent_id: str = "",
action_json: str = "{}",
reward: float = 0.0,
done: bool = False,
info_json: str = "{}",
) -> None:
"""Write one row to the step_audit table."""
try:
with sqlite3.connect(db_path) as conn:
conn.execute(
"INSERT INTO step_audit "
"(timestamp, event, step, agent_id, action_json, reward, done, info_json) "
"VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
(time.time(), event, step, agent_id, action_json, reward, int(done), info_json),
)
conn.commit()
except Exception:
pass # audit must never crash the env
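# Reading the audit trail back is a plain SQLite query over the table created
# above (sketch):
#
#     with sqlite3.connect("episodes.db") as conn:
#         rows = conn.execute(
#             "SELECT event, step, agent_id, reward, done FROM step_audit ORDER BY id"
#         ).fetchall()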
# ===========================================================================
# EnterpriseEnvironment - OpenEnv wrapper
# ===========================================================================
class EnterpriseEnvironment(Environment):
"""
OpenEnv-compatible wrapper around the existing EnterpriseOpsEnv.
Usage::
env = EnterpriseEnvironment()
obs = env.reset()
result = env.step(EnterpriseAction(agent_id="it_agent", tool_call="get_tickets"))
For multi-agent training, call ``get_all_observations()`` after each
``step()`` to retrieve every agent's partial view.
    Anti-reward-hacking:
        - 30 s timeout per step
        - 3x identical-action loop -> extra -5 reward
        - hard cap at ``max_steps`` steps (default 8)
        - every step audited in episodes.db
"""
def __init__(
self,
scenario_path: Optional[str] = None,
seed: int = 42,
max_steps: int = MAX_STEPS_HARD_CAP,
noise_rate: float = 0.08,
db_path: str = "episodes.db",
) -> None:
super().__init__()
if scenario_path is None:
scenario_path = str(_ROOT / "env" / "scenarios" / "scenario_01.yaml")
self.inner_env = EnterpriseOpsEnv(
scenario_path=scenario_path,
seed=seed,
max_steps=max_steps,
noise_rate=noise_rate,
db_path=db_path,
)
# -- Pluggable reward - INTERFACE STABLE (Ayush overrides this) -----
self.reward_fn: Callable[..., float] = default_reward_fn
# -- IT: rule-based by default; optional HuggingFace LoRA via set_use_trained_it()
self._rule_it_agent: Any = ITAgent()
self._trained_it_agent: Any | None = None
self._use_trained_it: bool = False
# -- Rule-based fallback agents for untrained roles -----------------
self._fallback_agents: dict[str, Any] = {
AGENT_IT: self._rule_it_agent,
AGENT_MANAGER: ManagerAgent(AGENT_MANAGER),
AGENT_FINANCE: FinanceAgent(AGENT_FINANCE),
}
# -- Anti-hacking state ---------------------------------------------
self._action_history: deque[str] = deque(maxlen=20)
self._step_count: int = 0
self._max_steps: int = max_steps
self._done: bool = False
self._db_path: str = db_path
# -- Observation cache ----------------------------------------------
self._current_obs: dict[str, ObservationSchema] = {}
self._last_step_result: Optional[InnerStepResult] = None
_ensure_audit_table(db_path)
print(
f"[EnterpriseEnvironment] Wrapper ready | "
f"scenario={Path(scenario_path).stem} | "
f"max_steps={max_steps} | db={db_path}"
)
def set_use_trained_it(self, use: bool) -> None:
"""When True, IT fallback uses TrainedITAgent (LoRA); when False, rule-based ITAgent."""
self._use_trained_it = use
if use:
if self._trained_it_agent is None:
self._trained_it_agent = TrainedITAgent()
self._fallback_agents[AGENT_IT] = self._trained_it_agent
else:
self._fallback_agents[AGENT_IT] = self._rule_it_agent
def it_agent_status(self) -> str:
"""Short status string for the Gradio UI."""
if not self._use_trained_it:
return "Rule-based agents active"
if self._trained_it_agent is not None and self._trained_it_agent.model is not None:
return "Trained LoRA model active (HuggingFace)"
return "Trained mode on — LoRA not loaded, using rule-based fallback"
# ------------------------------------------------------------------
# reset
# ------------------------------------------------------------------
def reset(
self,
scenario: Optional[str] = None,
seed: Optional[int] = None,
use_trained_model: bool = False,
) -> EnterpriseObservation:
"""Reset the environment. Returns manager-agent view as primary obs."""
self.set_use_trained_it(use_trained_model)
kwargs: dict[str, Any] = {}
if scenario:
if not scenario.endswith(".yaml"):
scenario = str(_ROOT / "env" / "scenarios" / f"{scenario}.yaml")
kwargs["scenario"] = scenario
if seed is not None:
kwargs["seed"] = seed
self._current_obs = self.inner_env.reset(**kwargs)
self._action_history.clear()
self._step_count = 0
self._done = False
self._last_step_result = None
_log_audit(self._db_path, "reset", info_json=json.dumps(kwargs, default=str))
return _obs_to_openenv(
self._current_obs[AGENT_MANAGER],
reward_breakdown={},
reward=0.0,
done=False,
)
# ------------------------------------------------------------------
# step (single-agent primary interface)
# ------------------------------------------------------------------
def step(
self,
action: EnterpriseAction,
use_trained_model: Optional[bool] = None,
) -> dict[str, Any]:
"""
Execute one environment step.
The provided ``action`` controls one agent; all other agents use
their rule-based fallback policies.
Returns
-------
dict with keys: observation, reward, done, info
"""
if use_trained_model is not None:
self.set_use_trained_it(use_trained_model)
# -- 1. Hard cap ----------------------------------------------------
if self._done or self._step_count >= self._max_steps:
self._done = True
fallback_obs = next(iter(self._current_obs.values()))
obs = _obs_to_openenv(fallback_obs, {}, 0.0, True)
return {"observation": obs, "reward": 0.0, "done": True,
"info": {"reason": "max_steps_exceeded", "it_agent_status": self.it_agent_status()}}
start_time = time.time()
reward_penalty = 0.0
# -- 2. Loop detection ----------------------------------------------
fp = _action_fingerprint(action)
self._action_history.append(fp)
if len(self._action_history) >= LOOP_WINDOW:
tail = list(self._action_history)[-LOOP_WINDOW:]
if len(set(tail)) == 1:
reward_penalty += LOOP_PENALTY
# -- 3. Build full action dict --------------------------------------
primary = ActionSchema(
tool_call=action.tool_call,
tool_params=action.tool_params or {},
message_to=action.message_to,
message_content=action.message_content,
reasoning=action.reasoning,
)
all_actions: dict[str, ActionSchema] = {}
for aid in ALL_AGENTS:
if aid == action.agent_id:
all_actions[aid] = primary
elif aid == AGENT_OVERSIGHT:
all_actions[aid] = ActionSchema()
elif aid in self._fallback_agents and aid in self._current_obs:
all_actions[aid] = self._fallback_agents[aid].act(self._current_obs[aid])
else:
all_actions[aid] = ActionSchema()
# -- 4. Execute inner step ------------------------------------------
inner_result: InnerStepResult = self.inner_env.step(all_actions)
elapsed = time.time() - start_time
        # -- 5. Timeout check (post hoc: slow steps are penalized, not interrupted)
if elapsed > STEP_TIMEOUT_S:
self._done = True
obs = _obs_to_openenv(
inner_result.observations.get(AGENT_MANAGER, next(iter(inner_result.observations.values()))),
{}, TIMEOUT_PENALTY, True,
)
_log_audit(self._db_path, "timeout", self._step_count,
action.agent_id, json.dumps(action.to_dict(), default=str),
TIMEOUT_PENALTY, True, json.dumps({"elapsed": elapsed}))
return {"observation": obs, "reward": TIMEOUT_PENALTY, "done": True,
"info": {
"reason": "timeout",
"elapsed_s": elapsed,
"it_agent_status": self.it_agent_status(),
}}
# -- 6. Compute reward ----------------------------------------------
base_reward = self.reward_fn(inner_result, all_actions)
total_reward = base_reward + reward_penalty
# -- 7. Build reward breakdown --------------------------------------
breakdown: dict[str, float] = {}
for aid, rc in inner_result.rewards.items():
breakdown[f"{aid}_total"] = rc.total()
breakdown[f"{aid}_task"] = rc.task_completion
breakdown[f"{aid}_sla"] = rc.sla_adherence
breakdown[f"{aid}_coord"] = rc.coordination_bonus
breakdown[f"{aid}_drift"] = rc.schema_adaptation
breakdown[f"{aid}_breach"] = rc.sla_breach_penalty
breakdown[f"{aid}_halluc"] = rc.hallucination_penalty
breakdown[f"{aid}_oversight"] = rc.oversight_detection
if reward_penalty != 0.0:
breakdown["loop_penalty"] = reward_penalty
# -- 8. Update state -----------------------------------------------
self._current_obs = inner_result.observations
self._last_step_result = inner_result
self._step_count += 1
self._done = inner_result.done or self._step_count >= self._max_steps
# -- 9. Build primary observation ----------------------------------
primary_agent = action.agent_id if action.agent_id in inner_result.observations else AGENT_MANAGER
obs = _obs_to_openenv(
inner_result.observations[primary_agent],
breakdown, total_reward, self._done,
)
# -- 10. Audit log -------------------------------------------------
_log_audit(
self._db_path, "step", self._step_count, action.agent_id,
json.dumps(action.to_dict(), default=str), total_reward, self._done,
json.dumps({
"elapsed_s": elapsed,
"loop_penalty": reward_penalty,
"oversight_flags": inner_result.info.get("oversight_flags", []),
"schema_version": inner_result.info.get("schema_version", 1),
}, default=str),
)
return {
"observation": obs,
"reward": total_reward,
"done": self._done,
"info": {
"step": self._step_count,
"elapsed_s": elapsed,
"loop_penalty": reward_penalty,
"oversight_flags": inner_result.info.get("oversight_flags", []),
"schema_version": inner_result.info.get("schema_version", 1),
"tool_results": inner_result.info.get("tool_results", {}),
"it_agent_status": self.it_agent_status(),
},
}
# ------------------------------------------------------------------
# step_multi - all agents in one call (for multi-agent training)
# ------------------------------------------------------------------
def step_multi(
self, actions: dict[str, EnterpriseAction],
) -> dict[str, Any]:
"""
Execute one step with explicit actions for multiple agents.
Agents not in ``actions`` use their fallback rule policy.
Returns dict with: observations (per-agent), reward, done, info
"""
if self._done or self._step_count >= self._max_steps:
self._done = True
return {"observations": {}, "reward": 0.0, "done": True,
"info": {"reason": "max_steps_exceeded"}}
start_time = time.time()
# Build full action dict
all_actions: dict[str, ActionSchema] = {}
for aid in ALL_AGENTS:
if aid in actions:
a = actions[aid]
all_actions[aid] = ActionSchema(
tool_call=a.tool_call,
tool_params=a.tool_params or {},
message_to=a.message_to,
message_content=a.message_content,
reasoning=a.reasoning,
)
elif aid == AGENT_OVERSIGHT:
all_actions[aid] = ActionSchema()
elif aid in self._fallback_agents and aid in self._current_obs:
all_actions[aid] = self._fallback_agents[aid].act(self._current_obs[aid])
else:
all_actions[aid] = ActionSchema()
inner_result = self.inner_env.step(all_actions)
elapsed = time.time() - start_time
if elapsed > STEP_TIMEOUT_S:
self._done = True
return {"observations": {}, "reward": TIMEOUT_PENALTY, "done": True,
"info": {"reason": "timeout", "elapsed_s": elapsed}}
        base_reward = self.reward_fn(inner_result, all_actions)
        self._current_obs = inner_result.observations
        self._last_step_result = inner_result
        self._step_count += 1
        self._done = inner_result.done or self._step_count >= self._max_steps
        # Audit multi-agent steps as well, so the episodes.db trail stays complete.
        _log_audit(
            self._db_path, "step_multi", self._step_count, ",".join(actions),
            json.dumps({aid: a.to_dict() for aid, a in actions.items()}, default=str),
            base_reward, self._done, json.dumps({"elapsed_s": elapsed}, default=str),
        )
        per_agent_obs: dict[str, EnterpriseObservation] = {}
        for aid, obs in inner_result.observations.items():
            rc = inner_result.rewards.get(aid, RewardComponents())
            per_agent_obs[aid] = _obs_to_openenv(obs, rc.model_dump(), rc.total(), self._done)
return {
"observations": per_agent_obs,
"reward": base_reward,
"done": self._done,
"info": {
"step": self._step_count,
"elapsed_s": elapsed,
"schema_version": inner_result.info.get("schema_version", 1),
},
}
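    # Example call (a sketch: ``review_project`` is an illustrative tool name,
    # while ``get_tickets`` matches the usage shown in the class docstring):
    #
    #     result = env.step_multi({
    #         AGENT_IT: EnterpriseAction(agent_id=AGENT_IT, tool_call="get_tickets"),
    #         AGENT_MANAGER: EnterpriseAction(
    #             agent_id=AGENT_MANAGER, tool_call="review_project"),
    #     })
    #     obs_by_agent = result["observations"]  # finance uses its rule fallback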
# ------------------------------------------------------------------
# state property
# ------------------------------------------------------------------
@property
def state(self) -> dict[str, Any]:
"""Full world state snapshot (read-only view)."""
        raw = dict(self.inner_env.state)  # shallow copy: keeps the inner snapshot read-only
raw["wrapper_step_count"] = self._step_count
raw["wrapper_done"] = self._done
return raw
# ------------------------------------------------------------------
# get_all_observations - multi-agent partial views
# ------------------------------------------------------------------
def get_all_observations(self) -> dict[str, EnterpriseObservation]:
"""Return the latest per-agent EnterpriseObservation dict."""
result: dict[str, EnterpriseObservation] = {}
for aid, obs in self._current_obs.items():
reward_total = 0.0
breakdown: dict[str, float] = {}
if self._last_step_result and aid in self._last_step_result.rewards:
rc = self._last_step_result.rewards[aid]
reward_total = rc.total()
breakdown = rc.model_dump()
result[aid] = _obs_to_openenv(obs, breakdown, reward_total, self._done)
return result
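    # Typical training-loop read (a sketch): step once, then fan out per-agent
    # views and scalar rewards to your own replay buffer:
    #
    #     env.step(EnterpriseAction(agent_id=AGENT_IT, tool_call="get_tickets"))
    #     views = env.get_all_observations()
    #     rewards = env.per_agent_rewards()
    #     for aid, view in views.items():
    #         buffer.add(view, rewards.get(aid, 0.0))  # ``buffer`` is your own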
# ------------------------------------------------------------------
# set_scenario - curriculum switching
# ------------------------------------------------------------------
def set_scenario(self, scenario_name: str) -> None:
"""
Switch scenario for the next reset().
Args:
scenario_name: e.g. "scenario_03" or full path to YAML.
"""
if not scenario_name.endswith(".yaml"):
scenario_name = str(_ROOT / "env" / "scenarios" / f"{scenario_name}.yaml")
self.inner_env._scenario_path = scenario_name
print(f"[EnterpriseEnvironment] Scenario set -> {Path(scenario_name).stem}")
# ------------------------------------------------------------------
# per_agent_rewards - convenience for training loops
# ------------------------------------------------------------------
def per_agent_rewards(self) -> dict[str, float]:
"""Return per-agent scalar rewards from the last step."""
if not self._last_step_result:
return {aid: 0.0 for aid in ALL_AGENTS}
return {
aid: rc.total()
for aid, rc in self._last_step_result.rewards.items()
}
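# ---------------------------------------------------------------------------
# Smoke test - a minimal sketch; assumes the default scenario YAML is present
# and that the rule-based fallbacks can act without a LoRA checkpoint.
# Repeating one action deliberately trips the loop penalty after 3 steps.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    env = EnterpriseEnvironment()
    env.reset(seed=0)
    done = False
    while not done:
        result = env.step(
            EnterpriseAction(agent_id=AGENT_IT, tool_call="get_tickets")
        )
        done = result["done"]
        print(f"reward={result['reward']:.2f} done={done} "
              f"info={result['info'].get('reason', 'ok')}")
    print("per-agent rewards:", env.per_agent_rewards())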