ajaxwin committed on
Commit 2171069
1 Parent(s): 41a051f

bug fixes in inference.py

Files changed (3)
  1. .gitignore +2 -1
  2. inference.py +47 -53
  3. org_inference.py +0 -449
.gitignore CHANGED
@@ -13,4 +13,5 @@ baseline_scores.json
 .pytest_cache/
 MySolution.md
 nltk_data
-eval_results.json
+eval_results.json
+groq.py
inference.py CHANGED
@@ -3,13 +3,12 @@ inference.py
 ------------
 Inference script — Smart Contract Audit RL Environment.
 
-Implements agents for all three tasks using the OpenAI-compatible client.
+Implements agents for all three tasks using the Groq client.
 Emits mandatory structured stdout in the OpenEnv format.
 
 MANDATORY ENV VARS:
-    API_BASE_URL   LLM API endpoint (default: https://api.openai.com/v1)
-    MODEL_NAME     Model identifier (default: gpt-4o-mini)
-    HF_TOKEN       API key / HF token
+    GROQ_API_KEY   Groq API key (required)
+    MODEL_NAME     Model identifier (default: openai/gpt-oss-20b)
 
 MANDATORY STDOUT FORMAT (per episode):
 [START] task=<id> env=smart-contract-audit model=<model>
@@ -49,6 +48,8 @@ HF_TOKEN = os.getenv("HF_TOKEN", "")
 if not HF_TOKEN:
     raise RuntimeError("HF_TOKEN environment variable not set")
 
+client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
+
 # Benchmark / environment identifier (constant for this env)
 ENV_BENCHMARK = "smart-contract-audit"
 
@@ -58,13 +59,31 @@ SEED_BASE = 42
 
 # Max steps per task
 MAX_STEPS_T1 = 15
-MAX_STEPS_T2 = 10
-MAX_STEPS_T3 = 12
+MAX_STEPS_T2 = 15
+MAX_STEPS_T3 = 15
 
 # A grader_score >= this is considered a "success" for the [END] line
 SUCCESS_SCORE_THRESHOLD = 0.5
 
-client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
+# ─────────────────────────────────────────────────────────────────────────────
+# Unified LLM call function
+# ─────────────────────────────────────────────────────────────────────────────
+
+def get_llm_response(
+    messages: List[Dict[str, str]],
+    max_tokens: int = 200,
+    temperature: float = 0.0,
+) -> str:
+    """
+    Call the Groq LLM with the given messages and parameters.
+    Returns the response content as a string.
+    Raises an exception on failure (to be caught by the caller).
+    """
+    completion = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages, max_tokens=max_tokens, temperature=temperature,  # type: ignore
+    )
+    return completion.choices[0].message.content.strip()  # type: ignore
 
 
 # ─────────────────────────────────────────────────────────────────────────────
@@ -113,9 +132,6 @@ def log_end(
 
 def _t1_user_msg(obs: Dict[str, Any]) -> str:
     return (
-        f"Contract: {obs['contract_name']}\n"
-        f"Description: {obs['contract_description']}\n"
-        f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
         f"Last action : {obs['last_action'] or 'None'}\n"
         f"Last result : {obs['last_action_result'] or 'Episode just started.'}"
     )
@@ -126,9 +142,9 @@ def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str,
     r = env.reset(seed=seed)
     obs = r.observation.model_dump()
 
-    log_start(task="task1_vuln_detection", env=ENV_BENCHMARK, model=MODEL_NAME)  # type: ignore
+    log_start(task="task1_vuln_detection", env=ENV_BENCHMARK, model=MODEL_NAME)
 
-    messages: List[ChatCompletionMessageParam] = [  # type: ignore
+    messages: List[Dict[str, str]] = [
         {"role": "system", "content": T1_SYSTEM}
     ]
     step_rewards: List[float] = []
@@ -140,11 +156,7 @@ def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str,
         for step in range(1, MAX_STEPS_T1 + 1):
             messages.append({"role": "user", "content": _t1_user_msg(obs)})
             try:
-                resp = client.chat.completions.create(
-                    model=MODEL_NAME, messages=messages,  # type: ignore
-                    max_tokens=200, temperature=0.0,
-                )
-                raw = resp.choices[0].message.content.strip()  # type: ignore
+                raw = get_llm_response(messages, max_tokens=200, temperature=0.0)
                 error_msg = None
             except Exception as e:
                 raw = ""
@@ -169,8 +181,7 @@ def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str,
             log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
 
             if done:
-                v = r_val
-                grader_score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
+                grader_score = r_val
                 break
 
             time.sleep(0.3)
@@ -196,10 +207,8 @@ def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str,
 def _t2_user_msg(obs: Dict[str, Any]) -> str:
     extra = obs.get("extra", {})
     return (
-        f"Contract : {obs['contract_name']}\n"
-        f"Function : {extra.get('target_function', '?')} "
-        f"({extra.get('target_signature', '')})\n"
-        f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
+        f"Target Function : {extra.get('target_function', '?')} "
+        # f"({extra.get('target_signature', '')})\n"
         f"Last action : {obs['last_action'] or 'None'}\n"
         f"Last result :\n{obs['last_action_result'] or 'Episode just started.'}"
     )
@@ -211,9 +220,9 @@ def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str,
     obs = r.observation.model_dump()
     fn = obs["extra"].get("target_function", "?")
 
-    log_start(task="task2_property_discovery", env=ENV_BENCHMARK, model=MODEL_NAME)  # type: ignore
+    log_start(task="task2_property_discovery", env=ENV_BENCHMARK, model=MODEL_NAME)
 
-    messages: List[ChatCompletionMessageParam] = [  # type: ignore
+    messages: List[Dict[str, str]] = [
         {"role": "system", "content": T2_SYSTEM}
     ]
     step_rewards: List[float] = []
@@ -225,11 +234,7 @@ def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str,
         for step in range(1, MAX_STEPS_T2 + 1):
             messages.append({"role": "user", "content": _t2_user_msg(obs)})
             try:
-                resp = client.chat.completions.create(
-                    model=MODEL_NAME, messages=messages,  # type: ignore
-                    max_tokens=400, temperature=0.0,
-                )
-                raw = resp.choices[0].message.content.strip()  # type: ignore
+                raw = get_llm_response(messages, max_tokens=400, temperature=0.0)
                 error_msg = None
             except Exception as e:
                 raw = ""
@@ -254,7 +259,7 @@ def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str,
             log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
 
             if done:
-                grader_score = round(r_val / 5.0, 3) if r_val > 0 else 0.0
+                grader_score = r_val
                 break
 
             time.sleep(0.3)
@@ -281,9 +286,7 @@ def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str,
 def _t3_user_msg(obs: Dict[str, Any]) -> str:
     extra = obs.get("extra", {})
     return (
-        f"Contract : {obs['contract_name']}\n"
-        f"Property : {extra.get('property_english', '(none)')}\n"
-        f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
+        f"Verify Property : {extra.get('property_english', '(none)')}\n"
        f"Last action : {obs['last_action'] or 'None'}\n"
         f"Last result :\n{obs['last_action_result'] or 'Episode just started.'}"
     )
@@ -294,9 +297,9 @@ def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str,
     r = env.reset(seed=seed)
     obs = r.observation.model_dump()
 
-    log_start(task="task3_rule_checker", env=ENV_BENCHMARK, model=MODEL_NAME)  # type: ignore
+    log_start(task="task3_rule_checker", env=ENV_BENCHMARK, model=MODEL_NAME)
 
-    messages: List[ChatCompletionMessageParam] = [  # type: ignore
+    messages: List[Dict[str, str]] = [
         {"role": "system", "content": T3_SYSTEM}
     ]
     step_rewards: List[float] = []
@@ -308,11 +311,7 @@ def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str,
         for step in range(1, MAX_STEPS_T3 + 1):
             messages.append({"role": "user", "content": _t3_user_msg(obs)})
             try:
-                resp = client.chat.completions.create(
-                    model=MODEL_NAME, messages=messages,  # type: ignore
-                    max_tokens=200, temperature=0.0,
-                )
-                raw = resp.choices[0].message.content.strip()  # type: ignore
+                raw = get_llm_response(messages, max_tokens=200, temperature=0.0)
                 error_msg = None
             except Exception as e:
                 raw = ""
@@ -337,8 +336,7 @@ def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str,
             log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
 
             if done:
-                v = r_val
-                grader_score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
+                grader_score = r_val
                 break
 
             time.sleep(0.3)
@@ -367,13 +365,11 @@ def run_task1(n: int = NUM_EPISODES) -> Dict[str, Any]:
     env = Task1Environment()
     episodes = [_run_t1_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
     avg_s = sum(e["grader_score"] for e in episodes) / n
-    avg_r = sum(e["cumulative_reward"] for e in episodes) / n
     print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
-    print(f" Avg cum reward : {avg_r:.2f}", flush=True)
     return {
         "task_id": "task1_vuln_detection", "name": "Targeted Vulnerability Detection",
         "status": "active", "num_episodes": n, "episodes": episodes,
-        "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
+        "avg_grader_score": avg_s
     }
 
 
@@ -386,11 +382,10 @@ def run_task2(n: int = NUM_EPISODES) -> Dict[str, Any]:
     avg_s = sum(e["grader_score"] for e in episodes) / n
     avg_r = sum(e["cumulative_reward"] for e in episodes) / n
     print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
-    print(f" Avg cum reward : {avg_r:.2f}", flush=True)
     return {
         "task_id": "task2_property_discovery", "name": "Property Discovery",
         "status": "active", "num_episodes": n, "episodes": episodes,
-        "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
+        "avg_grader_score": avg_s
     }
 
 
@@ -403,11 +398,10 @@ def run_task3(n: int = NUM_EPISODES) -> Dict[str, Any]:
     avg_s = sum(e["grader_score"] for e in episodes) / n
     avg_r = sum(e["cumulative_reward"] for e in episodes) / n
     print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
-    print(f" Avg cum reward : {avg_r:.2f}", flush=True)
     return {
         "task_id": "task3_rule_checker", "name": "Rule Checker",
         "status": "active", "num_episodes": n, "episodes": episodes,
-        "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
+        "avg_grader_score": avg_s
     }
 
 
@@ -418,7 +412,7 @@ def run_task3(n: int = NUM_EPISODES) -> Dict[str, Any]:
 async def main() -> None:
     """Async entry point (wraps sync env calls; asyncio.run() expected by caller)."""
     print("Smart Contract Audit RL Environment — Baseline Inference", flush=True)
-    print(f"Model: {MODEL_NAME} | Base URL: {API_BASE_URL}", flush=True)
+    print(f"Model: {MODEL_NAME} | Groq API", flush=True)
 
     t1 = run_task1(NUM_EPISODES)
     t2 = run_task2(NUM_EPISODES)
@@ -426,7 +420,7 @@ async def main() -> None:
 
     results = {
         "model": MODEL_NAME,
-        "base_url": API_BASE_URL,
+        "backend": "groq",
         "tasks": [t1, t2, t3],
     }
     overall = sum(t["avg_grader_score"] for t in results["tasks"]) / 3
@@ -445,4 +439,4 @@ async def main() -> None:
 
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
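
Note on the configuration above: the revised docstring lists GROQ_API_KEY as the required credential, but the code in this commit still reads HF_TOKEN and API_BASE_URL and raises when HF_TOKEN is unset. A minimal sketch of a config block that would match the docstring, assuming Groq's OpenAI-compatible endpoint (the base URL, env-var handling, and default below are assumptions, not part of the commit):

    import os
    from openai import OpenAI

    # Sketch only: the committed code still authenticates with HF_TOKEN.
    GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
    MODEL_NAME = os.getenv("MODEL_NAME", "openai/gpt-oss-20b")
    if not GROQ_API_KEY:
        raise RuntimeError("GROQ_API_KEY environment variable not set")

    # Groq exposes an OpenAI-compatible API, so the same client class works.
    client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")
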
org_inference.py DELETED
@@ -1,449 +0,0 @@
-"""
-inference.py
-------------
-Inference script — Smart Contract Audit RL Environment.
-
-Implements agents for all three tasks using the OpenAI-compatible client.
-Emits mandatory structured stdout in the OpenEnv format.
-
-MANDATORY ENV VARS:
-    API_BASE_URL   LLM API endpoint (default: https://api.openai.com/v1)
-    MODEL_NAME     Model identifier (default: gpt-4o-mini)
-    HF_TOKEN       API key / HF token
-
-MANDATORY STDOUT FORMAT (per episode):
-[START] task=<id> env=smart-contract-audit model=<model>
-[STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<str|null>
-[END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
-
-Usage:
-    python inference.py
-
-Output:
-    Structured stdout per episode, plus baseline_scores.json summary.
-"""
-
-import asyncio
-import json
-import os
-import sys
-import time
-from typing import Any, Dict, List, Optional
-
-from openai import OpenAI
-
-from server import Task1Environment, Task2Environment, Task3Environment
-from env.schemas import Action, ActionType
-from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
-from dotenv import dotenv_values
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Configuration
-# ─────────────────────────────────────────────────────────────────────────────
-
-config = dotenv_values(".env")
-API_BASE_URL = config.get("API_BASE_URL", "https://api.openai.com/v1")
-MODEL_NAME = config.get("MODEL_NAME", "gpt-4o")
-HF_TOKEN = config.get("HF_TOKEN", "")
-
-if not HF_TOKEN:
-    print("[WARN] HF_TOKEN not set — API calls may fail.", file=sys.stderr)
-    exit(1)
-
-# Benchmark / environment identifier (constant for this env)
-ENV_BENCHMARK = "smart-contract-audit"
-
-# Episodes per task
-NUM_EPISODES = 3
-SEED_BASE = 42
-
-# Max steps per task
-MAX_STEPS_T1 = 15
-MAX_STEPS_T2 = 10
-MAX_STEPS_T3 = 12
-
-# A grader_score >= this is considered a "success" for the [END] line
-SUCCESS_SCORE_THRESHOLD = 0.5
-
-client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
-
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Mandatory stdout helpers
-# ─────────────────────────────────────────────────────────────────────────────
-
-def log_start(task: str, env: str, model: str) -> None:
-    """Emit the [START] line — one per episode."""
-    print(f"[START] task={task} env={env} model={model}", flush=True)
-
-
-def log_step(
-    step: int,
-    action: str,
-    reward: float,
-    done: bool,
-    error: Optional[str] = None,
-) -> None:
-    """Emit a [STEP] line — one per env.step() call."""
-    error_val = error if error else "null"
-    print(
-        f"[STEP] step={step} action={action} "
-        f"reward={reward:.2f} done={str(done).lower()} error={error_val}",
-        flush=True,
-    )
-
-
-def log_end(
-    success: bool,
-    steps: int,
-    score: float,
-    rewards: List[float],
-) -> None:
-    """Emit the [END] line — one per episode, always emitted."""
-    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
-    print(
-        f"[END] success={str(success).lower()} steps={steps} "
-        f"score={score:.3f} rewards={rewards_str}",
-        flush=True,
-    )
-
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Task 1 — Targeted Vulnerability Detection
-# ─────────────────────────────────────────────────────────────────────────────
-
-def _t1_user_msg(obs: Dict[str, Any]) -> str:
-    return (
-        f"Contract: {obs['contract_name']}\n"
-        f"Description: {obs['contract_description']}\n"
-        f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
-        f"Last action : {obs['last_action'] or 'None'}\n"
-        f"Last result : {obs['last_action_result'] or 'Episode just started.'}"
-    )
-
-
-def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str, Any]:
-    """Run one Task 1 episode; emit [START]/[STEP]/[END]."""
-    r = env.reset(seed=seed)
-    obs = r.observation.model_dump()
-
-    log_start(task="task1_vuln_detection", env=ENV_BENCHMARK, model=MODEL_NAME)  # type: ignore
-
-    messages: List[ChatCompletionMessageParam] = [  # type: ignore
-        {"role": "system", "content": T1_SYSTEM}
-    ]
-    step_rewards: List[float] = []
-    grader_score = 0.0
-    steps_taken = 0
-    error_msg: Optional[str] = None
-
-    try:
-        for step in range(1, MAX_STEPS_T1 + 1):
-            messages.append({"role": "user", "content": _t1_user_msg(obs)})
-            try:
-                resp = client.chat.completions.create(
-                    model=MODEL_NAME, messages=messages,  # type: ignore
-                    max_tokens=200, temperature=0.0,
-                )
-                raw = resp.choices[0].message.content.strip()  # type: ignore
-                error_msg = None
-            except Exception as e:
-                raw = ""
-                error_msg = str(e)[:80]
-                print(f"[DEBUG] T1 LLM error ep={ep_num} step={step}: {e}", file=sys.stderr)
-
-            try:
-                parsed = json.loads(raw)
-                at = ActionType(parsed["action"])
-                params = parsed.get("params", {})
-            except Exception:
-                at, params = ActionType.LIST_FUNCTIONS, {}
-
-            messages.append({"role": "assistant", "content": raw})
-            result = env.step(Action(action_type=at, params=params))
-            obs = result.observation.model_dump()
-            r_val = result.reward.value
-            done = result.done
-
-            step_rewards.append(r_val)
-            steps_taken = step
-            log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
-
-            if done:
-                v = r_val
-                grader_score = 1.0 if v >= 4.9 else (0.5 if v >= 0.9 else 0.0)
-                break
-
-            time.sleep(0.3)
-
-    finally:
-        success = grader_score >= SUCCESS_SCORE_THRESHOLD
-        log_end(success=success, steps=steps_taken, score=grader_score, rewards=step_rewards)
-
-    return {
-        "episode": ep_num,
-        "seed": seed,
-        "contract": obs["contract_name"],
-        "grader_score": grader_score,
-        "cumulative_reward": obs["cumulative_reward"],
-    }
-
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Task 2 — Property Discovery
-# ─────────────────────────────────────────────────────────────────────────────
-
-
-def _t2_user_msg(obs: Dict[str, Any]) -> str:
-    extra = obs.get("extra", {})
-    return (
-        f"Contract : {obs['contract_name']}\n"
-        f"Function : {extra.get('target_function', '?')} "
-        f"({extra.get('target_signature', '')})\n"
-        f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
-        f"Last action : {obs['last_action'] or 'None'}\n"
-        f"Last result :\n{obs['last_action_result'] or 'Episode just started.'}"
-    )
-
-
-def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str, Any]:
-    """Run one Task 2 episode; emit [START]/[STEP]/[END]."""
-    r = env.reset(seed=seed)
-    obs = r.observation.model_dump()
-    fn = obs["extra"].get("target_function", "?")
-
-    log_start(task="task2_property_discovery", env=ENV_BENCHMARK, model=MODEL_NAME)  # type: ignore
-
-    messages: List[ChatCompletionMessageParam] = [  # type: ignore
-        {"role": "system", "content": T2_SYSTEM}
-    ]
-    step_rewards: List[float] = []
-    grader_score = 0.0
-    steps_taken = 0
-    error_msg: Optional[str] = None
-
-    try:
-        for step in range(1, MAX_STEPS_T2 + 1):
-            messages.append({"role": "user", "content": _t2_user_msg(obs)})
-            try:
-                resp = client.chat.completions.create(
-                    model=MODEL_NAME, messages=messages,  # type: ignore
-                    max_tokens=400, temperature=0.0,
-                )
-                raw = resp.choices[0].message.content.strip()  # type: ignore
-                error_msg = None
-            except Exception as e:
-                raw = ""
-                error_msg = str(e)[:80]
-                print(f"[DEBUG] T2 LLM error ep={ep_num} step={step}: {e}", file=sys.stderr)
-
-            try:
-                parsed = json.loads(raw)
-                at = ActionType(parsed["action"])
-                params = parsed.get("params", {})
-            except Exception:
-                at, params = ActionType.GET_FUNCTION_CODE, {}
-
-            messages.append({"role": "assistant", "content": raw})
-            result = env.step(Action(action_type=at, params=params))
-            obs = result.observation.model_dump()
-            r_val = result.reward.value
-            done = result.done
-
-            step_rewards.append(r_val)
-            steps_taken = step
-            log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
-
-            if done:
-                grader_score = round(r_val / 5.0, 3) if r_val > 0 else 0.0
-                break
-
-            time.sleep(0.3)
-
-    finally:
-        success = grader_score >= SUCCESS_SCORE_THRESHOLD
-        log_end(success=success, steps=steps_taken, score=grader_score, rewards=step_rewards)
-
-    return {
-        "episode": ep_num,
-        "seed": seed,
-        "contract": obs["contract_name"],
-        "function": fn,
-        "grader_score": grader_score,
-        "cumulative_reward": obs["cumulative_reward"],
-    }
-
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Task 3 — Rule Checker
-# ─────────────────────────────────────────────────────────────────────────────
-
-
-def _t3_user_msg(obs: Dict[str, Any]) -> str:
-    extra = obs.get("extra", {})
-    return (
-        f"Contract : {obs['contract_name']}\n"
-        f"Property : {extra.get('property_english', '(none)')}\n"
-        f"Step: {obs['step_count']} | Reward so far: {obs['cumulative_reward']:.2f}\n\n"
-        f"Last action : {obs['last_action'] or 'None'}\n"
-        f"Last result :\n{obs['last_action_result'] or 'Episode just started.'}"
-    )
-
-
-def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str, Any]:
-    """Run one Task 3 episode; emit [START]/[STEP]/[END]."""
-    r = env.reset(seed=seed)
-    obs = r.observation.model_dump()
-
-    log_start(task="task3_rule_checker", env=ENV_BENCHMARK, model=MODEL_NAME)  # type: ignore
-
-    messages: List[ChatCompletionMessageParam] = [  # type: ignore
-        {"role": "system", "content": T3_SYSTEM}
-    ]
-    step_rewards: List[float] = []
-    grader_score = 0.0
-    steps_taken = 0
-    error_msg: Optional[str] = None
-
-    try:
-        for step in range(1, MAX_STEPS_T3 + 1):
-            messages.append({"role": "user", "content": _t3_user_msg(obs)})
-            try:
-                resp = client.chat.completions.create(
-                    model=MODEL_NAME, messages=messages,  # type: ignore
-                    max_tokens=200, temperature=0.0,
-                )
-                raw = resp.choices[0].message.content.strip()  # type: ignore
-                error_msg = None
-            except Exception as e:
-                raw = ""
-                error_msg = str(e)[:80]
-                print(f"[DEBUG] T3 LLM error ep={ep_num} step={step}: {e}", file=sys.stderr)
-
-            try:
-                parsed = json.loads(raw)
-                at = ActionType(parsed["action"])
-                params = parsed.get("params", {})
-            except Exception:
-                at, params = ActionType.LIST_FUNCTIONS, {}
-
-            messages.append({"role": "assistant", "content": raw})
-            result = env.step(Action(action_type=at, params=params))
-            obs = result.observation.model_dump()
-            r_val = result.reward.value
-            done = result.done
-
-            step_rewards.append(r_val)
-            steps_taken = step
-            log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
-
-            if done:
-                v = r_val
-                grader_score = 1.0 if v >= 4.9 else (0.3 if v >= 1.0 else 0.0)
-                break
-
-            time.sleep(0.3)
-
-    finally:
-        success = grader_score >= SUCCESS_SCORE_THRESHOLD
-        log_end(success=success, steps=steps_taken, score=grader_score, rewards=step_rewards)
-
-    return {
-        "episode": ep_num,
-        "seed": seed,
-        "contract": obs["contract_name"],
-        "grader_score": grader_score,
-        "cumulative_reward": obs["cumulative_reward"],
-    }
-
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Task runners
-# ─────────────────────────────────────────────────────────────────────────────
-
-def run_task1(n: int = NUM_EPISODES) -> Dict[str, Any]:
-    print("\n" + "="*60, flush=True)
-    print("TASK 1: Targeted Vulnerability Detection", flush=True)
-    print("="*60, flush=True)
-    env = Task1Environment()
-    episodes = [_run_t1_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
-    avg_s = sum(e["grader_score"] for e in episodes) / n
-    avg_r = sum(e["cumulative_reward"] for e in episodes) / n
-    print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
-    print(f" Avg cum reward : {avg_r:.2f}", flush=True)
-    return {
-        "task_id": "task1_vuln_detection", "name": "Targeted Vulnerability Detection",
-        "status": "active", "num_episodes": n, "episodes": episodes,
-        "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
-    }
-
-
-def run_task2(n: int = NUM_EPISODES) -> Dict[str, Any]:
-    print("\n" + "="*60, flush=True)
-    print("TASK 2: Property Discovery", flush=True)
-    print("="*60, flush=True)
-    env = Task2Environment()
-    episodes = [_run_t2_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
-    avg_s = sum(e["grader_score"] for e in episodes) / n
-    avg_r = sum(e["cumulative_reward"] for e in episodes) / n
-    print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
-    print(f" Avg cum reward : {avg_r:.2f}", flush=True)
-    return {
-        "task_id": "task2_property_discovery", "name": "Property Discovery",
-        "status": "active", "num_episodes": n, "episodes": episodes,
-        "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
-    }
-
-
-def run_task3(n: int = NUM_EPISODES) -> Dict[str, Any]:
-    print("\n" + "="*60, flush=True)
-    print("TASK 3: Rule Checker", flush=True)
-    print("="*60, flush=True)
-    env = Task3Environment()
-    episodes = [_run_t3_episode(env, SEED_BASE + i, i + 1) for i in range(n)]
-    avg_s = sum(e["grader_score"] for e in episodes) / n
-    avg_r = sum(e["cumulative_reward"] for e in episodes) / n
-    print(f"\n Avg grader score : {avg_s:.3f}", flush=True)
-    print(f" Avg cum reward : {avg_r:.2f}", flush=True)
-    return {
-        "task_id": "task3_rule_checker", "name": "Rule Checker",
-        "status": "active", "num_episodes": n, "episodes": episodes,
-        "avg_grader_score": avg_s, "avg_cumulative_reward": avg_r,
-    }
-
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Main
-# ─────────────────────────────────────────────────────────────────────────────
-
-async def main() -> None:
-    """Async entry point (wraps sync env calls; asyncio.run() expected by caller)."""
-    print("Smart Contract Audit RL Environment — Baseline Inference", flush=True)
-    print(f"Model: {MODEL_NAME} | Base URL: {API_BASE_URL}", flush=True)
-
-    t1 = run_task1(NUM_EPISODES)
-    t2 = run_task2(NUM_EPISODES)
-    t3 = run_task3(NUM_EPISODES)
-
-    results = {
-        "model": MODEL_NAME,
-        "base_url": API_BASE_URL,
-        "tasks": [t1, t2, t3],
-    }
-    overall = sum(t["avg_grader_score"] for t in results["tasks"]) / 3
-    results["overall_avg_score"] = overall
-
-    print("\n" + "="*60, flush=True)
-    print("BASELINE SUMMARY", flush=True)
-    print("="*60, flush=True)
-    for t in results["tasks"]:
-        print(f" ✅ {t['name']:40s}: {t['avg_grader_score']:.3f}", flush=True)
-    print(f"\n Overall avg grader score: {overall:.3f}", flush=True)
-
-    with open("baseline_scores.json", "w") as f:
-        json.dump(results, f, indent=2)
-    print("\n Scores written to baseline_scores.json", flush=True)
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
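
For reference, every episode loop (in both the old and new script) expects the model reply to be a JSON object with an "action" and optional "params", and falls back to a default ActionType when parsing fails. A minimal sketch of that protocol, where "list_functions" is an illustrative value only (the real values live in env.schemas.ActionType):

    import json

    raw = '{"action": "list_functions", "params": {}}'  # hypothetical model reply
    try:
        parsed = json.loads(raw)
        action, params = parsed["action"], parsed.get("params", {})
    except Exception:
        action, params = "list_functions", {}  # mirrors the T1/T3 fallback
    print(action, params)
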