| """ |
| Smoke evaluation: runs scripted actions against the environment. |
| No LLM needed. Tests that the environment, graders, and scoring work. |
| |
| Usage: |
| uv run python scripts/smoke_eval.py --task easy |
| uv run python scripts/smoke_eval.py --task medium |
| uv run python scripts/smoke_eval.py --task hard |
| uv run python scripts/smoke_eval.py --all |
| """ |
|
|
| import argparse |
| import json |
| import sys |
| from pathlib import Path |
|
|
| |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) |
|
|
| from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction |
|
|
| from server.environment import IntercompanyDisputeEnvironment |
|
|
|
|
| def extract_result(obs) -> dict: |
| """Extract result dict from CallToolObservation.""" |
| r = getattr(obs, "result", None) |
| if r is None: |
| return {} |
| if hasattr(r, "structured_content") and r.structured_content: |
| return r.structured_content |
| if hasattr(r, "data") and r.data: |
| return r.data |
| content = getattr(r, "content", []) |
| for item in content: |
| text = getattr(item, "text", None) |
| if text: |
| try: |
| return json.loads(text) |
| except Exception: |
| return {"text": text} |
| return {} |
|
|
|
|
| def step(env, tool_name, **args): |
| """Convenience wrapper.""" |
| return env.step(CallToolAction(tool_name=tool_name, arguments=args)) |
|
|
|
|
| def run_easy(env, scenario: str = "smoke") -> dict: |
| """Scripted easy task: match all pairs and eliminate.""" |
| obs = env.reset(task_id="easy_batch_matching", scenario_id=scenario) |
| print(f" Task: {obs.metadata.get('description', 'easy_batch_matching')[:80]}") |
| print(f" Step limit: {obs.metadata.get('step_limit')}") |
|
|
| seed_file = Path(__file__).resolve().parent.parent / "seed_data" / "easy" / f"{scenario}.json" |
| with open(seed_file) as f: |
| scenario_data = json.load(f) |
|
|
| |
| obs = env.step(ListToolsAction()) |
| print(f" Available tools: {[t.name for t in obs.tools]}") |
|
|
| |
| obs = step(env, "query_open_items", status="open") |
| r = extract_result(obs) |
| print(f" Open items: {r.get('total_count', '?')}") |
|
|
| |
| for pair in scenario_data["ground_truth"]["required_matches"]: |
| debit_id, credit_id = pair |
| obs = step(env, "execute_match", debit_txn_id=debit_id, credit_txn_id=credit_id) |
| match_data = extract_result(obs) |
| match_id = match_data.get("match_id", "") |
| print(f" Match {debit_id} ↔ {credit_id}: {match_data.get('status')} reward={obs.reward:.2f}") |
|
|
| if match_id: |
| obs = step(env, "execute_elimination", entity_id="US_PARENT", matched_pair_id=match_id) |
| print(f" Eliminate {match_id}: {extract_result(obs).get('status')} reward={obs.reward:.2f}") |
|
|
| if env._done: |
| break |
|
|
| final_meta = obs.metadata or {} |
| score = final_meta.get("terminal_task_score", "N/A") |
| print(f" RESULT: done={obs.done}, terminal_score={score}") |
| return {"done": obs.done, "terminal_score": score} |
|
|
|
|
| def run_medium(env, scenario: str = "smoke") -> dict: |
| """Scripted medium task: fetch docs, query FX, adjust, match, eliminate.""" |
| obs = env.reset(task_id="medium_fx_variance", scenario_id=scenario) |
| print(f" Task: {obs.metadata.get('description', 'medium_fx_variance')[:80]}") |
|
|
| seed_file = Path(__file__).resolve().parent.parent / "seed_data" / "medium" / f"{scenario}.json" |
| with open(seed_file) as f: |
| scenario_data = json.load(f) |
|
|
| |
| for doc in scenario_data["documents"]: |
| obs = step(env, "fetch_document", document_id=doc["document_id"]) |
| print(f" Fetched {doc['document_id']}: reward={obs.reward:.2f}") |
|
|
| |
| |
| fx_dates_seen = set() |
| for fx in scenario_data["fx_rates"]: |
| date_key = (fx["source_currency"], fx["target_currency"], fx["rate_date"]) |
| if date_key in fx_dates_seen: |
| continue |
| if fx["rate_date"] not in [r["rate_date"] for r in scenario_data["fx_rates"] |
| if r["rate_date"] < fx["rate_date"]]: |
| |
| obs = step(env, "calculate_fx", |
| source_currency=fx["source_currency"], |
| target_currency=fx["target_currency"], |
| amount="10000", |
| conversion_date=fx["rate_date"]) |
| r = extract_result(obs) |
| if "rate" in r: |
| print(f" FX {fx['rate_date']}: rate={r.get('rate')} reward={obs.reward:.2f}") |
| fx_dates_seen.add(date_key) |
|
|
| |
| for adj in scenario_data["ground_truth"]["required_adjustments"]: |
| doc_ids = ",".join(d["document_id"] for d in scenario_data["documents"]) |
| obs = step(env, "post_adjustment", |
| entity_id=adj["entity_id"], |
| debit_account_code=adj["debit_account_code"], |
| credit_account_code=adj["credit_account_code"], |
| amount=adj["amount"], |
| currency=adj["currency"], |
| reason_code=adj["reason_code"], |
| evidence_refs=doc_ids) |
| print(f" Adjustment {adj['entity_id']} {adj['amount']}: {extract_result(obs).get('status')} reward={obs.reward:.2f}") |
|
|
| |
| for pair in scenario_data["ground_truth"]["required_matches"]: |
| obs = step(env, "execute_match", debit_txn_id=pair[0], credit_txn_id=pair[1]) |
| match_data = extract_result(obs) |
| match_id = match_data.get("match_id", "") |
| print(f" Match {pair[0]}: {match_data.get('status')} reward={obs.reward:.2f}") |
| if match_id: |
| obs = step(env, "execute_elimination", entity_id="US_PARENT", matched_pair_id=match_id) |
| print(f" Eliminate {match_id}: {extract_result(obs).get('status')} reward={obs.reward:.2f}") |
| if env._done: |
| break |
|
|
| final_meta = obs.metadata or {} |
| score = final_meta.get("terminal_task_score", "N/A") |
| print(f" RESULT: done={obs.done}, terminal_score={score}") |
| return {"done": obs.done, "terminal_score": score} |
|
|
|
|
| def run_hard(env, scenario: str = "smoke") -> dict: |
| """Scripted hard task: fetch docs, consult legal, post adjustment.""" |
| obs = env.reset(task_id="hard_liability_dispute", scenario_id=scenario) |
| print(f" Task: {obs.metadata.get('description', 'hard_liability_dispute')[:80]}") |
|
|
| seed_file = Path(__file__).resolve().parent.parent / "seed_data" / "hard" / f"{scenario}.json" |
| with open(seed_file) as f: |
| scenario_data = json.load(f) |
|
|
| |
| for doc in scenario_data["documents"]: |
| obs = step(env, "fetch_document", document_id=doc["document_id"]) |
| print(f" Fetched {doc['document_id']} ({doc['document_type']}): reward={obs.reward:.2f}") |
|
|
| |
| if scenario_data.get("legal_truth"): |
| contract_id = scenario_data["legal_truth"]["contract_document_id"] |
| obs = step(env, "ask_legal_analyst", |
| document_id=contract_id, |
| question="Who is liable for the damaged goods under this contract?") |
| r = extract_result(obs) |
| print(f" Legal consultation ({contract_id}): {r.get('liable_entity_id', 'N/A')} liable, reward={obs.reward:.2f}") |
|
|
| |
| doc_ids = ",".join(d["document_id"] for d in scenario_data["documents"]) |
| for adj in scenario_data["ground_truth"]["required_adjustments"]: |
| obs = step(env, "post_adjustment", |
| entity_id=adj["entity_id"], |
| debit_account_code=adj["debit_account_code"], |
| credit_account_code=adj["credit_account_code"], |
| amount=adj["amount"], |
| currency=adj["currency"], |
| reason_code=adj["reason_code"], |
| evidence_refs=doc_ids) |
| print(f" Adjustment {adj['entity_id']} {adj['amount']} {adj['currency']}: {extract_result(obs).get('status')} reward={obs.reward:.2f}") |
| if env._done: |
| break |
|
|
| final_meta = obs.metadata or {} |
| score = final_meta.get("terminal_task_score", "N/A") |
| print(f" RESULT: done={obs.done}, terminal_score={score}") |
| return {"done": obs.done, "terminal_score": score} |
|
|
|
|
| RUNNERS = { |
| "easy": run_easy, |
| "medium": run_medium, |
| "hard": run_hard, |
| } |
|
|
|
|
| def main(): |
| parser = argparse.ArgumentParser(description="Smoke evaluation for intercompany dispute env") |
| parser.add_argument("--task", choices=["easy", "medium", "hard"], help="Task to run") |
| parser.add_argument("--scenario", default="smoke", help="Scenario name (default: smoke)") |
| parser.add_argument("--all", action="store_true", help="Run all tasks") |
| args = parser.parse_args() |
|
|
| if not args.all and not args.task: |
| parser.error("Must specify --task or --all") |
|
|
| tasks = list(RUNNERS.keys()) if args.all else [args.task] |
| results = {} |
|
|
| for task in tasks: |
| print(f"\n{'=' * 60}") |
| print(f"Task: {task} (scenario={args.scenario})") |
| print("=" * 60) |
| env = IntercompanyDisputeEnvironment() |
| result = RUNNERS[task](env, scenario=args.scenario) |
| results[task] = result |
|
|
| if args.all: |
| print(f"\n{'=' * 60}") |
| print("SUMMARY") |
| print("=" * 60) |
| for task, res in results.items(): |
| print(f" {task}: terminal_score={res.get('terminal_score', 'N/A')}") |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|