# OSINT/scripts/run_openai_baseline.py
# Author: siddeshwar-kagatikar
# Commit d814291: fix(rewards): never crash GRPO on malformed completions
from __future__ import annotations
import argparse
import json
import os
from osint_env.baselines import OpenAIBaselineConfig, OpenAIBaselineRunner
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the reproducible OpenAI baseline run.

    Returns:
        argparse.ArgumentParser: parser exposing path, model, sampling, and
        run-control options, all with reproducible defaults.
    """
    parser = argparse.ArgumentParser(
        description="Run the reproducible OpenAI baseline on the fixed-level OSINT benchmark."
    )

    # Plain string options: (flag, default, help).
    string_specs = [
        ("--config", "datasets/fixed_levels/shared_config_fixed_levels.json", "Shared config JSON."),
        ("--seed-file", "datasets/fixed_levels/seed_fixed_levels.json", "Fixed seed file JSON."),
        ("--output", "artifacts/baselines/openai_fixed_levels_latest.json", "Baseline result JSON output path."),
        ("--leaderboard", "artifacts/baselines/openai_fixed_levels_leaderboard.json", "Leaderboard JSON path."),
        ("--dashboard", "artifacts/baselines/openai_fixed_levels_dashboard.html", "Dashboard HTML path."),
        ("--run-name", "openai_fixed_levels_baseline", "Leaderboard run name."),
        ("--model", "gpt-5-nano", "OpenAI chat model name."),
        ("--openai-base-url", "https://api.openai.com/v1", "OpenAI-compatible base URL."),
        ("--openai-api-key", "", "OpenAI API key override."),
        ("--openai-api-key-env", "OPENAI_API_KEY", "Environment variable name for the API key."),
    ]
    for flag, default, help_text in string_specs:
        parser.add_argument(flag, default=default, help=help_text)

    # Typed numeric options: (flag, converter, default, help).
    typed_specs = [
        ("--episodes", int, 30, "Number of episodes to evaluate."),
        ("--max-steps", int, 8, "Episode step budget to keep runs bounded."),
        ("--temperature", float, 0.0, "Sampling temperature."),
        ("--max-tokens", int, 256, "Maximum completion tokens per step."),
        ("--timeout-seconds", int, 60, "Per-request timeout."),
        ("--seed", int, 7, "Request seed offset used for repeatable runs."),
    ]
    for flag, converter, default, help_text in typed_specs:
        parser.add_argument(flag, type=converter, default=default, help=help_text)

    # Boolean toggle: defaults to False, flipped on when the flag is present.
    parser.add_argument(
        "--skip-leaderboard",
        action="store_true",
        help="Do not append the run to the leaderboard file.",
    )
    return parser
def main() -> None:
    """Entry point: parse CLI args, run the baseline, and print a JSON report.

    The printed report carries the runner's "summary" section plus the output
    and dashboard paths so callers can locate the artifacts.
    """
    args = build_parser().parse_args()

    # An explicit --openai-api-key wins; otherwise fall back to the value of
    # the environment variable named by --openai-api-key-env (may be empty).
    resolved_key = args.openai_api_key or os.getenv(args.openai_api_key_env, "")

    baseline_config = OpenAIBaselineConfig(
        shared_config_path=args.config,
        seed_file=args.seed_file,
        output_path=args.output,
        leaderboard_path=args.leaderboard,
        dashboard_path=args.dashboard,
        run_name=args.run_name,
        model=args.model,
        base_url=args.openai_base_url,
        api_key=resolved_key,
        api_key_env=args.openai_api_key_env,
        temperature=args.temperature,
        max_tokens=args.max_tokens,
        timeout_seconds=args.timeout_seconds,
        episodes=args.episodes,
        max_steps=args.max_steps,
        seed=args.seed,
        append_leaderboard=not args.skip_leaderboard,
    )

    runner = OpenAIBaselineRunner(baseline_config)
    outcome = runner.run()

    report = {
        "summary": outcome["summary"],
        "output": args.output,
        "dashboard": args.dashboard,
    }
    print(json.dumps(report, indent=2, sort_keys=True))
# Script entry guard: run the baseline only when executed directly, not on import.
if __name__ == "__main__":
    main()