KChad committed on
Commit
4ddf12d
·
1 Parent(s): 3c3a64e

submission stuff

Browse files
baseline/run_baseline.py CHANGED
@@ -83,9 +83,9 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
83
  )
84
 
85
 
86
- def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
87
  rewards_str = ",".join(f"{reward:.2f}" for reward in rewards)
88
- print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
89
 
90
 
91
  def structured_stdout_only() -> bool:
@@ -500,7 +500,7 @@ def run_difficulty_baseline(agent_mode: str, client: Optional[Any], difficulty:
500
  result = env.get_result()
501
  score = result.final_score
502
  success = bool(result.task_completed and not result.exfil_occurred)
503
- log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
504
 
505
  difficulty_results.append(
506
  {
@@ -519,7 +519,7 @@ def run_difficulty_baseline(agent_mode: str, client: Optional[Any], difficulty:
519
  )
520
  except Exception as exc:
521
  emit_runtime_error(f"[ERROR] Episode {episode_number} failed: {exc}")
522
- log_end(success=False, steps=steps_taken, score=0.0, rewards=rewards)
523
  difficulty_results.append(
524
  {
525
  "episode": episode_number,
 
83
  )
84
 
85
 
86
+ def log_end(success: bool, steps: int, rewards: List[float]) -> None:
87
  rewards_str = ",".join(f"{reward:.2f}" for reward in rewards)
88
+ print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)
89
 
90
 
91
  def structured_stdout_only() -> bool:
 
500
  result = env.get_result()
501
  score = result.final_score
502
  success = bool(result.task_completed and not result.exfil_occurred)
503
+ log_end(success=success, steps=steps_taken, rewards=rewards)
504
 
505
  difficulty_results.append(
506
  {
 
519
  )
520
  except Exception as exc:
521
  emit_runtime_error(f"[ERROR] Episode {episode_number} failed: {exc}")
522
+ log_end(success=False, steps=steps_taken, rewards=rewards)
523
  difficulty_results.append(
524
  {
525
  "episode": episode_number,
inference.py CHANGED
@@ -17,22 +17,28 @@ except ModuleNotFoundError: # pragma: no cover - dependency-gated in local shel
17
  from baseline.run_baseline import main as baseline_main
18
 
19
 
20
- REQUIRED_ENV_VARS = ("API_BASE_URL", "MODEL_NAME", "HF_TOKEN")
 
21
 
22
 
23
  def validate_submission_env() -> Dict[str, str]:
24
- missing = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
25
- if missing:
26
- raise RuntimeError(
27
- "Missing required inference environment variables: "
28
- + ", ".join(missing)
29
- + ". Expected API_BASE_URL, MODEL_NAME, and HF_TOKEN."
30
- )
31
 
32
  if OpenAI is None:
33
  raise RuntimeError("The openai package is required for submission inference.")
34
 
35
- config = {name: os.environ[name] for name in REQUIRED_ENV_VARS}
 
 
 
 
 
36
 
37
  # Fail fast on invalid client wiring before the benchmark starts.
38
  OpenAI(base_url=config["API_BASE_URL"], api_key=config["HF_TOKEN"])
 
17
  from baseline.run_baseline import main as baseline_main
18
 
19
 
20
+ DEFAULT_API_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
21
+ DEFAULT_MODEL_NAME = "gemini-2.5-flash"
22
 
23
 
24
  def validate_submission_env() -> Dict[str, str]:
25
+ api_base_url = os.getenv("API_BASE_URL", DEFAULT_API_BASE_URL)
26
+ model_name = os.getenv("MODEL_NAME", DEFAULT_MODEL_NAME)
27
+ hf_token = os.getenv("HF_TOKEN")
28
+ local_image_name = os.getenv("LOCAL_IMAGE_NAME")
29
+
30
+ if not hf_token:
31
+ raise RuntimeError("Missing required inference environment variable: HF_TOKEN.")
32
 
33
  if OpenAI is None:
34
  raise RuntimeError("The openai package is required for submission inference.")
35
 
36
+ config = {
37
+ "API_BASE_URL": api_base_url,
38
+ "MODEL_NAME": model_name,
39
+ "HF_TOKEN": hf_token,
40
+ "LOCAL_IMAGE_NAME": local_image_name,
41
+ }
42
 
43
  # Fail fast on invalid client wiring before the benchmark starts.
44
  OpenAI(base_url=config["API_BASE_URL"], api_key=config["HF_TOKEN"])
pre_submission_check.py CHANGED
@@ -98,11 +98,7 @@ def inference_contract_validation() -> Dict[str, Any]:
98
  with patch.object(inference, "OpenAI", DummyOpenAI):
99
  with patch.dict(
100
  "os.environ",
101
- {
102
- "API_BASE_URL": "https://example.invalid/v1",
103
- "MODEL_NAME": "test-model",
104
- "HF_TOKEN": "test-token",
105
- },
106
  clear=True,
107
  ):
108
  config = inference.validate_submission_env()
@@ -114,10 +110,15 @@ def inference_contract_validation() -> Dict[str, Any]:
114
  }
115
 
116
  passed = (
117
- config.get("API_BASE_URL") == "https://example.invalid/v1"
118
- and config.get("MODEL_NAME") == "test-model"
119
  and config.get("HF_TOKEN") == "test-token"
120
- and DummyOpenAI.last_kwargs == {"base_url": "https://example.invalid/v1", "api_key": "test-token"}
 
 
 
 
 
121
  )
122
  return {
123
  "name": "inference_contract",
@@ -140,13 +141,13 @@ def structured_log_contract_validation() -> Dict[str, Any]:
140
  with redirect_stdout(output):
141
  baseline_runner.log_start("task-1", "env-1", "model-1")
142
  baseline_runner.log_step(1, "ReadDoc", 0.5, False, None)
143
- baseline_runner.log_end(True, 1, 0.75, [0.5])
144
 
145
  lines = [line.strip() for line in output.getvalue().splitlines() if line.strip()]
146
  expected = [
147
  "[START] task=task-1 env=env-1 model=model-1",
148
  "[STEP] step=1 action=ReadDoc reward=0.50 done=false error=null",
149
- "[END] success=true steps=1 score=0.750 rewards=0.50",
150
  ]
151
  return {
152
  "name": "structured_log_contract",
 
98
  with patch.object(inference, "OpenAI", DummyOpenAI):
99
  with patch.dict(
100
  "os.environ",
101
+ {"HF_TOKEN": "test-token"},
 
 
 
 
102
  clear=True,
103
  ):
104
  config = inference.validate_submission_env()
 
110
  }
111
 
112
  passed = (
113
+ config.get("API_BASE_URL") == "https://generativelanguage.googleapis.com/v1beta/openai/"
114
+ and config.get("MODEL_NAME") == "gemini-2.5-flash"
115
  and config.get("HF_TOKEN") == "test-token"
116
+ and config.get("LOCAL_IMAGE_NAME") is None
117
+ and DummyOpenAI.last_kwargs
118
+ == {
119
+ "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
120
+ "api_key": "test-token",
121
+ }
122
  )
123
  return {
124
  "name": "inference_contract",
 
141
  with redirect_stdout(output):
142
  baseline_runner.log_start("task-1", "env-1", "model-1")
143
  baseline_runner.log_step(1, "ReadDoc", 0.5, False, None)
144
+ baseline_runner.log_end(True, 1, [0.5])
145
 
146
  lines = [line.strip() for line in output.getvalue().splitlines() if line.strip()]
147
  expected = [
148
  "[START] task=task-1 env=env-1 model=model-1",
149
  "[STEP] step=1 action=ReadDoc reward=0.50 done=false error=null",
150
+ "[END] success=true steps=1 rewards=0.50",
151
  ]
152
  return {
153
  "name": "structured_log_contract",
submission_tests/test_inference_contract.py CHANGED
@@ -6,16 +6,38 @@ import inference
6
 
7
 
8
  class InferenceContractTests(unittest.TestCase):
9
- def test_validate_submission_env_requires_hackathon_variables(self) -> None:
10
  with patch.dict("os.environ", {}, clear=True):
11
  with self.assertRaises(RuntimeError) as ctx:
12
  inference.validate_submission_env()
13
 
14
- self.assertIn("API_BASE_URL", str(ctx.exception))
15
- self.assertIn("MODEL_NAME", str(ctx.exception))
16
  self.assertIn("HF_TOKEN", str(ctx.exception))
17
 
18
- def test_validate_submission_env_uses_openai_client(self) -> None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  class DummyOpenAI:
20
  last_kwargs = None
21
 
@@ -26,20 +48,22 @@ class InferenceContractTests(unittest.TestCase):
26
  with patch.dict(
27
  "os.environ",
28
  {
29
- "API_BASE_URL": "https://router.huggingface.co/v1",
30
- "MODEL_NAME": "openai/gpt-4o-mini",
31
  "HF_TOKEN": "hf_test_token",
 
32
  },
33
  clear=True,
34
  ):
35
  config = inference.validate_submission_env()
36
 
37
- self.assertEqual(config["API_BASE_URL"], "https://router.huggingface.co/v1")
38
- self.assertEqual(config["MODEL_NAME"], "openai/gpt-4o-mini")
39
  self.assertEqual(config["HF_TOKEN"], "hf_test_token")
 
40
  self.assertEqual(
41
  DummyOpenAI.last_kwargs,
42
- {"base_url": "https://router.huggingface.co/v1", "api_key": "hf_test_token"},
43
  )
44
 
45
  def test_main_forces_submission_safe_defaults(self) -> None:
 
6
 
7
 
8
  class InferenceContractTests(unittest.TestCase):
9
+ def test_validate_submission_env_requires_hf_token(self) -> None:
10
  with patch.dict("os.environ", {}, clear=True):
11
  with self.assertRaises(RuntimeError) as ctx:
12
  inference.validate_submission_env()
13
 
 
 
14
  self.assertIn("HF_TOKEN", str(ctx.exception))
15
 
16
+ def test_validate_submission_env_uses_defaults_and_openai_client(self) -> None:
17
+ class DummyOpenAI:
18
+ last_kwargs = None
19
+
20
+ def __init__(self, **kwargs):
21
+ DummyOpenAI.last_kwargs = kwargs
22
+
23
+ with patch.object(inference, "OpenAI", DummyOpenAI):
24
+ with patch.dict(
25
+ "os.environ",
26
+ {"HF_TOKEN": "hf_test_token"},
27
+ clear=True,
28
+ ):
29
+ config = inference.validate_submission_env()
30
+
31
+ self.assertEqual(config["API_BASE_URL"], "https://generativelanguage.googleapis.com/v1beta/openai/")
32
+ self.assertEqual(config["MODEL_NAME"], "gemini-2.5-flash")
33
+ self.assertEqual(config["HF_TOKEN"], "hf_test_token")
34
+ self.assertIsNone(config["LOCAL_IMAGE_NAME"])
35
+ self.assertEqual(
36
+ DummyOpenAI.last_kwargs,
37
+ {"base_url": "https://generativelanguage.googleapis.com/v1beta/openai/", "api_key": "hf_test_token"},
38
+ )
39
+
40
+ def test_validate_submission_env_allows_explicit_overrides_and_local_image(self) -> None:
41
  class DummyOpenAI:
42
  last_kwargs = None
43
 
 
48
  with patch.dict(
49
  "os.environ",
50
  {
51
+ "API_BASE_URL": "https://example.invalid/v1",
52
+ "MODEL_NAME": "test-model",
53
  "HF_TOKEN": "hf_test_token",
54
+ "LOCAL_IMAGE_NAME": "local-env-image",
55
  },
56
  clear=True,
57
  ):
58
  config = inference.validate_submission_env()
59
 
60
+ self.assertEqual(config["API_BASE_URL"], "https://example.invalid/v1")
61
+ self.assertEqual(config["MODEL_NAME"], "test-model")
62
  self.assertEqual(config["HF_TOKEN"], "hf_test_token")
63
+ self.assertEqual(config["LOCAL_IMAGE_NAME"], "local-env-image")
64
  self.assertEqual(
65
  DummyOpenAI.last_kwargs,
66
+ {"base_url": "https://example.invalid/v1", "api_key": "hf_test_token"},
67
  )
68
 
69
  def test_main_forces_submission_safe_defaults(self) -> None: