# Sahil Sahu
# built all phases as per the build plan
# 8df8372
"""
Smoke evaluation: runs scripted actions against the environment.
No LLM needed. Tests that the environment, graders, and scoring work.
Usage:
uv run python scripts/smoke_eval.py --task easy
uv run python scripts/smoke_eval.py --task medium
uv run python scripts/smoke_eval.py --task hard
uv run python scripts/smoke_eval.py --all
"""
import argparse
import json
import sys
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction
from server.environment import IntercompanyDisputeEnvironment
def extract_result(obs) -> dict:
    """Extract the result payload from a tool-call observation.

    Sources are tried in order and the first non-empty one wins:

    1. ``obs.result.structured_content``
    2. ``obs.result.data``
    3. the first item of ``obs.result.content`` that has a non-empty ``text``
       attribute, parsed as JSON. Text that cannot be parsed is returned
       wrapped as ``{"text": text}``.

    Args:
        obs: Observation returned by ``env.step``; only its ``result``
            attribute is inspected.

    Returns:
        The extracted payload, or ``{}`` when nothing usable is found.
    """
    result = getattr(obs, "result", None)
    if result is None:
        return {}

    structured = getattr(result, "structured_content", None)
    if structured:
        return structured

    data = getattr(result, "data", None)
    if data:
        return data

    # ``content`` may be present but set to None; treat that like "no content"
    # instead of failing when iterating over it.
    content = getattr(result, "content", None) or []
    for item in content:
        text = getattr(item, "text", None)
        if not text:
            continue
        try:
            return json.loads(text)
        except (ValueError, TypeError):
            # Not JSON (or not a str/bytes value): hand the raw text back.
            return {"text": text}
    return {}
def step(env, tool_name, **args):
    """Call ``tool_name`` on ``env``, passing the keyword arguments as the tool arguments.

    Returns whatever ``env.step`` returns for the resulting ``CallToolAction``.
    """
    action = CallToolAction(tool_name=tool_name, arguments=args)
    return env.step(action)
def run_easy(env, scenario: str = "smoke") -> dict:
    """Run the scripted easy task: match every required pair, then eliminate it.

    Resets ``env`` to ``easy_batch_matching``, reads the ground-truth pairs
    from the scenario's seed file and replays them through the
    ``execute_match`` and ``execute_elimination`` tools, printing progress.

    Args:
        env: Environment instance exposing ``reset`` and ``step``.
        scenario: Stem of the seed file under ``seed_data/easy``.

    Returns:
        ``{"done": ..., "terminal_score": ...}`` read from the last
        observation; the score is ``"N/A"`` when its metadata carries no
        ``terminal_task_score``.
    """
    obs = env.reset(task_id="easy_batch_matching", scenario_id=scenario)
    # Guard against a missing metadata dict, as done for the final observation.
    reset_meta = obs.metadata or {}
    print(f" Task: {reset_meta.get('description', 'easy_batch_matching')[:80]}")
    print(f" Step limit: {reset_meta.get('step_limit')}")

    seed_file = Path(__file__).resolve().parent.parent / "seed_data" / "easy" / f"{scenario}.json"
    with open(seed_file, encoding="utf-8") as f:
        scenario_data = json.load(f)

    # List tools
    obs = env.step(ListToolsAction())
    print(f" Available tools: {[t.name for t in obs.tools]}")

    # Query open items
    obs = step(env, "query_open_items", status="open")
    open_items = extract_result(obs)
    print(f" Open items: {open_items.get('total_count', '?')}")

    # Match and eliminate each pair
    for debit_id, credit_id in scenario_data["ground_truth"]["required_matches"]:
        obs = step(env, "execute_match", debit_txn_id=debit_id, credit_txn_id=credit_id)
        match_data = extract_result(obs)
        match_id = match_data.get("match_id", "")
        # Separate the two ids so they stay readable in the log line.
        print(f" Match {debit_id} <-> {credit_id}: {match_data.get('status')} reward={obs.reward:.2f}")
        if match_id:
            obs = step(env, "execute_elimination", entity_id="US_PARENT", matched_pair_id=match_id)
            print(f" Eliminate {match_id}: {extract_result(obs).get('status')} reward={obs.reward:.2f}")
        # Stop issuing actions once the environment flags the episode as done.
        if env._done:
            break

    final_meta = obs.metadata or {}
    score = final_meta.get("terminal_task_score", "N/A")
    print(f" RESULT: done={obs.done}, terminal_score={score}")
    return {"done": obs.done, "terminal_score": score}
def run_medium(env, scenario: str = "smoke") -> dict:
    """Run the scripted medium task: fetch docs, query FX, adjust, match, eliminate.

    Resets ``env`` to ``medium_fx_variance`` and drives it from the scenario's
    seed file: every document is fetched, every distinct FX table entry is
    queried, the ground-truth adjustments are posted, and the ground-truth
    pairs are matched and eliminated.

    Args:
        env: Environment instance exposing ``reset`` and ``step``.
        scenario: Stem of the seed file under ``seed_data/medium``.

    Returns:
        ``{"done": ..., "terminal_score": ...}`` read from the last
        observation; the score is ``"N/A"`` when its metadata carries no
        ``terminal_task_score``.
    """
    obs = env.reset(task_id="medium_fx_variance", scenario_id=scenario)
    # Guard against a missing metadata dict, as done for the final observation.
    reset_meta = obs.metadata or {}
    print(f" Task: {reset_meta.get('description', 'medium_fx_variance')[:80]}")

    seed_file = Path(__file__).resolve().parent.parent / "seed_data" / "medium" / f"{scenario}.json"
    with open(seed_file, encoding="utf-8") as f:
        scenario_data = json.load(f)

    # Fetch all documents (evidence gathering)
    for doc in scenario_data["documents"]:
        obs = step(env, "fetch_document", document_id=doc["document_id"])
        print(f" Fetched {doc['document_id']}: reward={obs.reward:.2f}")

    # Query FX once per distinct (source, target, date) entry of the fx_rates
    # table. NOTE: the extra per-date filter that used to sit here compared a
    # date only against strictly-earlier dates, so it never excluded anything;
    # dropping it keeps the same queries without the quadratic scan.
    seen_fx_keys = set()
    for fx in scenario_data["fx_rates"]:
        fx_key = (fx["source_currency"], fx["target_currency"], fx["rate_date"])
        if fx_key in seen_fx_keys:
            continue
        seen_fx_keys.add(fx_key)
        obs = step(env, "calculate_fx",
                   source_currency=fx["source_currency"],
                   target_currency=fx["target_currency"],
                   amount="10000",
                   conversion_date=fx["rate_date"])
        fx_result = extract_result(obs)
        if "rate" in fx_result:
            print(f" FX {fx['rate_date']}: rate={fx_result.get('rate')} reward={obs.reward:.2f}")

    # Post adjustments from ground truth; every adjustment cites all documents,
    # so the reference string is built once up front.
    doc_ids = ",".join(d["document_id"] for d in scenario_data["documents"])
    for adj in scenario_data["ground_truth"]["required_adjustments"]:
        obs = step(env, "post_adjustment",
                   entity_id=adj["entity_id"],
                   debit_account_code=adj["debit_account_code"],
                   credit_account_code=adj["credit_account_code"],
                   amount=adj["amount"],
                   currency=adj["currency"],
                   reason_code=adj["reason_code"],
                   evidence_refs=doc_ids)
        print(f" Adjustment {adj['entity_id']} {adj['amount']}: {extract_result(obs).get('status')} reward={obs.reward:.2f}")

    # Match and eliminate
    for pair in scenario_data["ground_truth"]["required_matches"]:
        obs = step(env, "execute_match", debit_txn_id=pair[0], credit_txn_id=pair[1])
        match_data = extract_result(obs)
        match_id = match_data.get("match_id", "")
        print(f" Match {pair[0]}: {match_data.get('status')} reward={obs.reward:.2f}")
        if match_id:
            obs = step(env, "execute_elimination", entity_id="US_PARENT", matched_pair_id=match_id)
            print(f" Eliminate {match_id}: {extract_result(obs).get('status')} reward={obs.reward:.2f}")
        # Stop issuing actions once the environment flags the episode as done.
        if env._done:
            break

    final_meta = obs.metadata or {}
    score = final_meta.get("terminal_task_score", "N/A")
    print(f" RESULT: done={obs.done}, terminal_score={score}")
    return {"done": obs.done, "terminal_score": score}
def run_hard(env, scenario: str = "smoke") -> dict:
    """Run the scripted hard task: fetch docs, consult legal, post adjustments.

    Resets ``env`` to ``hard_liability_dispute`` and drives it from the
    scenario's seed file: every document is fetched, the legal analyst is
    asked about the contract named in ``legal_truth`` (when that section is
    present), and the ground-truth adjustments are posted.

    Args:
        env: Environment instance exposing ``reset`` and ``step``.
        scenario: Stem of the seed file under ``seed_data/hard``.

    Returns:
        ``{"done": ..., "terminal_score": ...}`` read from the last
        observation; the score is ``"N/A"`` when its metadata carries no
        ``terminal_task_score``.
    """
    obs = env.reset(task_id="hard_liability_dispute", scenario_id=scenario)
    # Guard against a missing metadata dict, as done for the final observation.
    reset_meta = obs.metadata or {}
    print(f" Task: {reset_meta.get('description', 'hard_liability_dispute')[:80]}")

    seed_file = Path(__file__).resolve().parent.parent / "seed_data" / "hard" / f"{scenario}.json"
    with open(seed_file, encoding="utf-8") as f:
        scenario_data = json.load(f)

    # Fetch all documents
    for doc in scenario_data["documents"]:
        obs = step(env, "fetch_document", document_id=doc["document_id"])
        print(f" Fetched {doc['document_id']} ({doc['document_type']}): reward={obs.reward:.2f}")

    # Consult legal analyst
    if scenario_data.get("legal_truth"):
        contract_id = scenario_data["legal_truth"]["contract_document_id"]
        obs = step(env, "ask_legal_analyst",
                   document_id=contract_id,
                   question="Who is liable for the damaged goods under this contract?")
        legal = extract_result(obs)
        print(f" Legal consultation ({contract_id}): {legal.get('liable_entity_id', 'N/A')} liable, reward={obs.reward:.2f}")

    # Post adjustments from ground truth; every adjustment cites all documents.
    doc_ids = ",".join(d["document_id"] for d in scenario_data["documents"])
    for adj in scenario_data["ground_truth"]["required_adjustments"]:
        obs = step(env, "post_adjustment",
                   entity_id=adj["entity_id"],
                   debit_account_code=adj["debit_account_code"],
                   credit_account_code=adj["credit_account_code"],
                   amount=adj["amount"],
                   currency=adj["currency"],
                   reason_code=adj["reason_code"],
                   evidence_refs=doc_ids)
        print(f" Adjustment {adj['entity_id']} {adj['amount']} {adj['currency']}: {extract_result(obs).get('status')} reward={obs.reward:.2f}")
        # Stop issuing actions once the environment flags the episode as done.
        if env._done:
            break

    final_meta = obs.metadata or {}
    score = final_meta.get("terminal_task_score", "N/A")
    print(f" RESULT: done={obs.done}, terminal_score={score}")
    return {"done": obs.done, "terminal_score": score}
# Dispatch table: task name accepted by --task -> scripted runner function.
RUNNERS = dict(
    easy=run_easy,
    medium=run_medium,
    hard=run_hard,
)
def main():
    """Parse the command line and run the requested smoke task(s).

    Either ``--task`` or ``--all`` must be given; ``--all`` takes precedence
    and additionally prints a per-task score summary at the end. Each task
    runs against a freshly constructed environment.
    """
    cli = argparse.ArgumentParser(description="Smoke evaluation for intercompany dispute env")
    cli.add_argument("--task", choices=["easy", "medium", "hard"], help="Task to run")
    cli.add_argument("--scenario", default="smoke", help="Scenario name (default: smoke)")
    cli.add_argument("--all", action="store_true", help="Run all tasks")
    args = cli.parse_args()

    if not (args.all or args.task):
        cli.error("Must specify --task or --all")

    selected = list(RUNNERS) if args.all else [args.task]

    results = {}
    for name in selected:
        print(f"\n{'=' * 60}")
        print(f"Task: {name} (scenario={args.scenario})")
        print("=" * 60)
        fresh_env = IntercompanyDisputeEnvironment()
        results[name] = RUNNERS[name](fresh_env, scenario=args.scenario)

    # The summary is only printed for multi-task runs.
    if not args.all:
        return

    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print("=" * 60)
    for name, outcome in results.items():
        print(f" {name}: terminal_score={outcome.get('terminal_score', 'N/A')}")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()