from __future__ import annotations

import argparse
import json
import os

from osint_env.baselines import OpenAIBaselineConfig, OpenAIBaselineRunner


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the fixed-level OSINT baseline run.

    Returns:
        An ``argparse.ArgumentParser`` preconfigured with file locations,
        OpenAI connection settings, and run-control knobs, each with a
        default so the script runs with no arguments.
    """
    parser = argparse.ArgumentParser(
        description="Run the reproducible OpenAI baseline on the fixed-level OSINT benchmark."
    )
    add = parser.add_argument

    # Input/output file locations.
    add(
        "--config",
        default="datasets/fixed_levels/shared_config_fixed_levels.json",
        help="Shared config JSON.",
    )
    add(
        "--seed-file",
        default="datasets/fixed_levels/seed_fixed_levels.json",
        help="Fixed seed file JSON.",
    )
    add(
        "--output",
        default="artifacts/baselines/openai_fixed_levels_latest.json",
        help="Baseline result JSON output path.",
    )
    add(
        "--leaderboard",
        default="artifacts/baselines/openai_fixed_levels_leaderboard.json",
        help="Leaderboard JSON path.",
    )
    add(
        "--dashboard",
        default="artifacts/baselines/openai_fixed_levels_dashboard.html",
        help="Dashboard HTML path.",
    )
    add("--run-name", default="openai_fixed_levels_baseline", help="Leaderboard run name.")

    # OpenAI connection settings.
    add("--model", default="gpt-5-nano", help="OpenAI chat model name.")
    add("--openai-base-url", default="https://api.openai.com/v1", help="OpenAI-compatible base URL.")
    add("--openai-api-key", default="", help="OpenAI API key override.")
    add(
        "--openai-api-key-env",
        default="OPENAI_API_KEY",
        help="Environment variable name for the API key.",
    )

    # Run-control knobs.
    add("--episodes", type=int, default=30, help="Number of episodes to evaluate.")
    add("--max-steps", type=int, default=8, help="Episode step budget to keep runs bounded.")
    add("--temperature", type=float, default=0.0, help="Sampling temperature.")
    add("--max-tokens", type=int, default=256, help="Maximum completion tokens per step.")
    add("--timeout-seconds", type=int, default=60, help="Per-request timeout.")
    add("--seed", type=int, default=7, help="Request seed offset used for repeatable runs.")
    add(
        "--skip-leaderboard",
        action="store_true",
        help="Do not append the run to the leaderboard file.",
    )
    return parser


def main() -> None:
    """Entry point: parse CLI options, run the baseline, print a JSON summary."""
    cli = build_parser().parse_args()

    # An explicit --openai-api-key wins; an empty value falls back to the
    # environment variable named by --openai-api-key-env (default "" if unset).
    api_key = cli.openai_api_key
    if not api_key:
        api_key = os.environ.get(cli.openai_api_key_env, "")

    settings = dict(
        shared_config_path=cli.config,
        seed_file=cli.seed_file,
        output_path=cli.output,
        leaderboard_path=cli.leaderboard,
        dashboard_path=cli.dashboard,
        run_name=cli.run_name,
        model=cli.model,
        base_url=cli.openai_base_url,
        api_key=api_key,
        api_key_env=cli.openai_api_key_env,
        temperature=cli.temperature,
        max_tokens=cli.max_tokens,
        timeout_seconds=cli.timeout_seconds,
        episodes=cli.episodes,
        max_steps=cli.max_steps,
        seed=cli.seed,
        append_leaderboard=not cli.skip_leaderboard,
    )
    outcome = OpenAIBaselineRunner(OpenAIBaselineConfig(**settings)).run()

    report = {"summary": outcome["summary"], "output": cli.output, "dashboard": cli.dashboard}
    print(json.dumps(report, indent=2, sort_keys=True))


# Run the baseline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()