Spaces:

Codex47
/

SmartContractAudit

Running

App Files Files Community

ajaxwin committed 7 days ago

Commit

5235476

1 Parent(s): 409c8b7

refactor: Update ActionType to include costs and modified grader for task 1

Browse files

Files changed (6) hide show

env/schemas.py +33 -25
server/app.py +5 -9
server/tasks/task1/actions.py +35 -62
server/tasks/task1/environment.py +18 -28
server/tasks/task1/grader.py +15 -43
utils/semanticmatcher.py +2 -14

env/schemas.py CHANGED Viewed

@@ -24,28 +24,43 @@ from pydantic import BaseModel, Field
 # ---------------------------------------------------------------------------
 class ActionType(str, Enum):
     # ── Task 1 – Vulnerability Detection ───────────────────────────────────
-    LIST_FUNCTIONS       = "list_functions"
-    GET_FUNCTION_CODE    = "get_function_code"
-    GET_FUNCTION_SUMMARY = "get_function_summary"
-    GET_FILE_METADATA    = "get_file_metadata"
-    GET_STATE_VARIABLE   = "get_state_variable"
-    GET_CALL_GRAPH       = "get_call_graph"
-    SUBMIT               = "submit"
     # ── Task 2 – Property Discovery ─────────────────────────────────────────
-    GET_SIMILAR_RULE      = "get_similar_rule"       # -0.20
-    GET_FILE_NATSPEC      = "get_file_natspec"        # -0.03
-    GET_FUNCTION_NATSPEC  = "get_function_natspec"    # -0.08
-    GET_RELATED_FUNCTIONS = "get_related_functions"   # -0.06
-    GET_SIGNATURE         = "get_signature"           # -0.04
-    SUBMIT_PROPERTY       = "submit_property"         # scored 0–5, one attempt
     # ── Task 3 – Rule Checker ────────────────────────────────────────────────
-    GET_PROPERTY_SPECIFICATION = "get_property_specification"  # -0.03
-    GET_FUNCTION_METADATA   = "get_function_metadata"    # -0.05
-    SUBMIT_FUNCTION         = "submit_function"          # +5.0 / +1.5 / -1.5, one attempt
 class Action(BaseModel):
     """
@@ -73,27 +88,20 @@ class Observation(BaseModel):
     task_id              : which task is active
     contract_name        : name of the Solidity contract
-    contract_description : high-level description of what the contract does
     available_actions    : list of valid ActionType strings
     last_action          : the action that produced this observation (None on reset)
     last_action_result   : human-readable result of the last action
-    step_count           : number of steps taken so far
-    cumulative_reward    : running reward total
     done                 : whether the episode has ended
     extra                : any additional task-specific context
     """
     task_id: str
     contract_name: str
-    contract_description: str
-    available_actions: List[str]
     last_action: Optional[str] = None
     last_action_result: Optional[str] = None
-    step_count: int = 0
-    cumulative_reward: float = 0.0
     done: bool = False
     extra: Dict[str, Any] = Field(default_factory=dict)
 # ---------------------------------------------------------------------------
 # Reward
 # ---------------------------------------------------------------------------

 # ---------------------------------------------------------------------------
 class ActionType(str, Enum):
     """Every action an agent can issue, paired with the reward cost of issuing it.

     Members are declared as ``(wire string, cost)``. The wire string becomes
     the member's value, so ``ActionType("submit")`` resolves by string, and
     the second element is exposed through the ``cost`` attribute.
     """

     # Declared for type checkers only; __new__ assigns it on every member.
     cost: float

     def __new__(cls, str_value: str, cost: float):
         """Create a member whose value is ``str_value`` and attach ``cost`` to it."""
         member = str.__new__(cls, str_value)
         # Only the string is the enum value; the cost rides along as an attribute.
         member._value_ = str_value
         member.cost = cost
         return member

     # -- Task 1: Vulnerability Detection -----------------------------------
     LIST_FUNCTIONS = ("list_functions", -0.04)
     GET_FUNCTION_CODE = ("get_function_code", -0.14)
     GET_FUNCTION_SUMMARY = ("get_function_summary", -0.07)
     GET_FILE_METADATA = ("get_file_metadata", -0.02)
     GET_STATE_VARIABLE = ("get_state_variable", -0.06)
     GET_CALL_GRAPH = ("get_call_graph", -0.08)
     SUBMIT = ("submit", 0.0)

     # -- Task 2: Property Discovery ----------------------------------------
     GET_SIMILAR_RULE = ("get_similar_rule", 0.0)
     GET_FILE_NATSPEC = ("get_file_natspec", 0.0)
     GET_FUNCTION_NATSPEC = ("get_function_natspec", 0.0)
     GET_RELATED_FUNCTIONS = ("get_related_functions", 0.0)
     GET_SIGNATURE = ("get_signature", 0.0)
     SUBMIT_PROPERTY = ("submit_property", 0.0)

     # -- Task 3: Rule Checker ----------------------------------------------
     GET_PROPERTY_SPECIFICATION = ("get_property_specification", 0.0)
     GET_FUNCTION_METADATA = ("get_function_metadata", 0.0)
     SUBMIT_FUNCTION = ("submit_function", 0.0)

     # -- General actions ---------------------------------------------------
     UNKNOWN = ("unknown", 0.0)
     REPEATED = ("repeated", -0.22)
     RESUBMIT = ("resubmit", 0.0)
 class Action(BaseModel):
     """
     task_id              : which task is active
     contract_name        : name of the Solidity contract
     available_actions    : list of valid ActionType strings
     last_action          : the action that produced this observation (None on reset)
     last_action_result   : human-readable result of the last action
     done                 : whether the episode has ended
     extra                : any additional task-specific context
     """
     task_id: str
     contract_name: str
+    # available_actions: List[str]       # May need it, may not depends on the agent
     last_action: Optional[str] = None
     last_action_result: Optional[str] = None
     done: bool = False
     extra: Dict[str, Any] = Field(default_factory=dict)
 # ---------------------------------------------------------------------------
 # Reward
 # ---------------------------------------------------------------------------

server/app.py CHANGED Viewed

@@ -190,18 +190,15 @@ def step(
             status_code=400,
             detail=f"No active session '{session_id}'. Call /reset first.",
         )
-    try:
-        action_type = ActionType(body.action_type)
-    except ValueError:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Unknown action_type '{body.action_type}'. Valid: {[a.value for a in ActionType]}",
-        )
     action = Action(action_type=action_type, params=body.params)
     try:
         result = env.step(action)
     except RuntimeError as e:
-        raise HTTPException(status_code=409, detail=str(e))
     return JSONResponse(content=result.model_dump(), status_code=200)
@@ -216,7 +213,6 @@ def state(session_id: str = Query(default=DEFAULT_SESSION)):
         )
     return JSONResponse(content=env.state().model_dump(), status_code=200)
 @app.get("/action_space")
 def action_space(task_id: str = "task1_vuln_detection"):
     """Describe the action space for a task."""

             status_code=400,
             detail=f"No active session '{session_id}'. Call /reset first.",
         )
+    # removed error handling here
+    action_type = ActionType(body.action_type) if body.action_type in ActionType else ActionType.UNKNOWN
     action = Action(action_type=action_type, params=body.params)
     try:
         result = env.step(action)
     except RuntimeError as e:
+        return JSONResponse(content=str(e), status_code = 200)
     return JSONResponse(content=result.model_dump(), status_code=200)
         )
     return JSONResponse(content=env.state().model_dump(), status_code=200)
 @app.get("/action_space")
 def action_space(task_id: str = "task1_vuln_detection"):
     """Describe the action space for a task."""

server/tasks/task1/actions.py CHANGED Viewed

@@ -1,13 +1,8 @@
 """Actions for Task 1: Targeted Vulnerability Detection.
-    Actions & rewards:
-    list_functions       -0.05  (broad overview of contract)
-    get_function_code    -0.10 (wrong function) / +0.05 (correct function)
-    get_function_summary -0.05 (wrong function) / +0.03 (correct function)
-    get_file_metadata    -0.04  (general contract info)
 """
 from typing import Any, Dict, Tuple
-from env.schemas import Reward
 from data.data_loader import (
     list_function_names,
     get_function_by_name,
@@ -19,11 +14,11 @@ from data.data_loader import (
 def list_functions(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle LIST_FUNCTIONS action."""
     if ctx._is_repeated(qkey):
-        return "Repeated query.", Reward(value=-0.40, reason="Repeated query", partial=True)
     names = list_function_names(ctx._contract)
     return (
         f"Functions in {ctx._contract['contract_name']}: {', '.join(names)}",
-        Reward(value=-0.05, reason="list_functions cost", partial=True),
     )
@@ -31,20 +26,19 @@ def get_function_code(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle GET_FUNCTION_CODE action."""
     fn_name = params.get("function_name", "")
     if ctx._is_repeated(qkey):
-        return "Repeated query.", Reward(value=-0.40, reason="Repeated query", partial=True)
     fn = get_function_by_name(ctx._contract, fn_name)
     if fn is None:
         return (
             f"Function '{fn_name}' not found. Available: {list_function_names(ctx._contract)}",
-            Reward(value=-0.10, reason="Wrong/unknown function name", partial=True),
         )
-    is_target = fn["name"].lower() == ctx._target_fn["name"].lower()
     code = fn.get("code", "// no code available")
-    reward_val = 0.05 if is_target else -0.10
-    reason = "Fetched target function code (+)" if is_target else "Fetched non-target function (-)"
     return (
         f"// {fn['name']}\n{code}",
-        Reward(value=reward_val, reason=reason, partial=True),
     )
@@ -52,71 +46,74 @@ def get_function_summary(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward
     """Handle GET_FUNCTION_SUMMARY action."""
     fn_name = params.get("function_name", "")
     if ctx._is_repeated(qkey):
-        return "Repeated query.", Reward(value=-0.40, reason="Repeated query", partial=True)
     fn = get_function_by_name(ctx._contract, fn_name)
     if fn is None:
         return (
             f"Function '{fn_name}' not found.",
-            Reward(value=-0.05, reason="Wrong function name", partial=True),
         )
-    is_target = fn["name"].lower() == ctx._target_fn["name"].lower()
     comment = fn.get("comment", "No summary available.")
-    reward_val = 0.03 if is_target else -0.05
-    reason = "Fetched target function summary (+)" if is_target else "Fetched non-target summary (-)"
     return (
         f"Summary of '{fn['name']}': {comment}",
-        Reward(value=reward_val, reason=reason, partial=True),
     )
 def get_file_metadata(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle GET_FILE_METADATA action."""
     if ctx._is_repeated(qkey):
-        return "Repeated query.", Reward(value=-0.40, reason="Repeated query", partial=True)
     meta = ctx._contract.get("metadata", {})
     result = (
         f"Contract: {ctx._contract['contract_name']} | "
         f"Solidity: {meta.get('solidity_version', 'N/A')} | "
         f"Description: {meta.get('description', 'N/A')}"
     )
-    return result, Reward(value=-0.04, reason="get_file_metadata cost", partial=True)
 def get_state_variable(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle GET_STATE_VARIABLE action."""
     var_name = params.get("variable_name", "")
     if ctx._is_repeated(qkey):
-        return "Repeated query.", Reward(value=-0.40, reason="Repeated query", partial=True)
     if not var_name:
         names = list_state_variable_names(ctx._contract)
         return (
             f"State variables: {', '.join(names)}",
-            Reward(value=-0.05, reason="Listed state variables", partial=True),
         )
     sv = get_state_variable_by_name(ctx._contract, var_name)
     if sv is None:
         return (
             f"Variable '{var_name}' not found.",
-            Reward(value=-0.05, reason="Unknown state variable", partial=True),
         )
     return (
         f"{sv['type']} {sv['visibility']} {sv['name']}: {sv.get('description', '')}",
-        Reward(value=-0.05, reason="get_state_variable cost", partial=True),
     )
 def get_call_graph(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle GET_CALL_GRAPH action."""
     if ctx._is_repeated(qkey):
-        return "Repeated query.", Reward(value=-0.40, reason="Repeated query", partial=True)
     cg = ctx._contract.get("call_graph", {})
     cg_str = "; ".join(f"{fn} → [{', '.join(callees)}]" for fn, callees in cg.items())
     return (
         f"Call graph: {cg_str}",
-        Reward(value=-0.08, reason="get_call_graph cost", partial=True),
     )
 def submit(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle SUBMIT action for Task 1.
@@ -127,14 +124,15 @@ def submit(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """
     if ctx._done:
         return (
-            "❌ You have already submitted for this episode. "
             "Only ONE submission is allowed.",
-            Reward(value=0.0, reason="Second submit_function attempt", partial=False),
         )
     fn_name   = params.get("function_name", "").strip()
     vuln_type = params.get("vulnerability_type", "").strip()
     if not fn_name or not vuln_type:
         return (
             "submit_function requires both 'function_name' and "
@@ -142,35 +140,10 @@ def submit(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
             Reward(value=0.0, reason="Malformed submission", partial=False),
         )
-    ctx._done      = True
-    score      = ctx._grader.grade_submission(fn_name, vuln_type)   # {0.0, 0.5, 1.0}
-    reward_val = ctx._grader.reward_for_score(score)                 # [0.0, 1.0]
-    correct    = ctx._grader.get_canonical_answer()
-    if score == 1.0:
-        msg = (
-            f"✅ CORRECT! '{fn_name}' is the vulnerable function "
-            f"and the vulnerability type matches. "
-            f"Score: 1.0 → Reward: {reward_val:.3f}"
-        )
-    elif score == 0.5:
-        msg = (
-            f"🟡 PARTIAL. '{fn_name}' is the correct function but the "
-            f"vulnerability type was not recognised. "
-            f"Score: 0.5 → Reward: {reward_val:.3f}. "
-            f"Expected vulnerability: '{correct['vulnerability']}'."
-        )
-    else:
-        msg = (
-            f"❌ INCORRECT. '{fn_name}' is not the target function. "
-            f"Score: 0.0 → Reward: {reward_val:.3f}. "
-            f"Correct answer: function='{correct['function']}', "
-            f"vulnerability='{correct['vulnerability']}'."
-        )
-    return msg, Reward(
-        value=reward_val,
         reason=f"submit_function score={score:.1f}",
         partial=False,
     )
@@ -180,5 +153,5 @@ def unknown_action(ctx: Any, qkey: str, params: Dict, action_type: str) -> Tuple
     """Fallback for unknown actions."""
     return (
         f"Unknown action type: {action_type}",
-        Reward(value=-0.10, reason="Unknown action", partial=True),
     )

 """Actions for Task 1: Targeted Vulnerability Detection.
 """
 from typing import Any, Dict, Tuple
+from env.schemas import ActionType, Reward
 from data.data_loader import (
     list_function_names,
     get_function_by_name,
 def list_functions(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle LIST_FUNCTIONS: report the names of the contract's functions.

     A query that ``ctx._is_repeated`` flags as already seen is charged the
     REPEATED cost; otherwise the LIST_FUNCTIONS cost applies. ``params`` is
     not used by this handler.
     """
     if not ctx._is_repeated(qkey):
         function_names = list_function_names(ctx._contract)
         message = f"Functions in {ctx._contract['contract_name']}: {', '.join(function_names)}"
         charge = Reward(
             value=ActionType.LIST_FUNCTIONS.cost,
             reason="list_functions cost",
             partial=True,
         )
         return message, charge

     penalty = Reward(value=ActionType.REPEATED.cost, reason="Repeated query", partial=True)
     return "Repeated query.", penalty
     """Handle GET_FUNCTION_CODE action."""
     fn_name = params.get("function_name", "")
     if ctx._is_repeated(qkey):
+        return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query", partial=True)
     fn = get_function_by_name(ctx._contract, fn_name)
     if fn is None:
         return (
             f"Function '{fn_name}' not found. Available: {list_function_names(ctx._contract)}",
+            Reward(value=ActionType.GET_FUNCTION_CODE.cost, reason="Wrong/unknown function name", partial=True),
         )
     code = fn.get("code", "// no code available")
     return (
         f"// {fn['name']}\n{code}",
+        Reward(value=ActionType.GET_FUNCTION_CODE.cost, reason="Fetched code", partial=True),
     )
     """Handle GET_FUNCTION_SUMMARY action."""
     fn_name = params.get("function_name", "")
     if ctx._is_repeated(qkey):
+        return "Repeated query.", Reward(value=ActionType.REPEATED.cost, reason="Repeated query", partial=True)
     fn = get_function_by_name(ctx._contract, fn_name)
     if fn is None:
         return (
             f"Function '{fn_name}' not found.",
+            Reward(value=ActionType.GET_FUNCTION_SUMMARY.cost, reason="Wrong function name", partial=True),
         )
     comment = fn.get("comment", "No summary available.")
     return (
         f"Summary of '{fn['name']}': {comment}",
+        Reward(value=ActionType.GET_FUNCTION_SUMMARY.cost, reason="Fetched summary", partial=True),
     )
 def get_file_metadata(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle GET_FILE_METADATA: summarise the contract name, Solidity version and description.

     Missing metadata fields are rendered as ``N/A``. A repeated query is
     charged the REPEATED cost instead of the GET_FILE_METADATA cost.
     ``params`` is not used by this handler.
     """
     if ctx._is_repeated(qkey):
         penalty = Reward(value=ActionType.REPEATED.cost, reason="Repeated query", partial=True)
         return "Repeated query.", penalty

     metadata = ctx._contract.get("metadata", {})
     fields = [
         f"Contract: {ctx._contract['contract_name']}",
         f"Solidity: {metadata.get('solidity_version', 'N/A')}",
         f"Description: {metadata.get('description', 'N/A')}",
     ]
     charge = Reward(
         value=ActionType.GET_FILE_METADATA.cost,
         reason="get_file_metadata cost",
         partial=True,
     )
     return " | ".join(fields), charge
 def get_state_variable(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle GET_STATE_VARIABLE: describe one state variable, or list them all.

     ``params["variable_name"]`` selects the variable; when it is absent or
     empty the names of all state variables are listed instead. Every
     non-repeated outcome (listing, hit, or miss) is charged the same
     GET_STATE_VARIABLE cost; a repeated query is charged the REPEATED cost.
     """
     requested = params.get("variable_name", "")

     if ctx._is_repeated(qkey):
         return "Repeated query.", Reward(
             value=ActionType.REPEATED.cost, reason="Repeated query", partial=True
         )

     # All remaining branches share one cost and differ only in the reason text.
     cost = ActionType.GET_STATE_VARIABLE.cost

     if not requested:
         all_names = list_state_variable_names(ctx._contract)
         listing = f"State variables: {', '.join(all_names)}"
         return listing, Reward(value=cost, reason="Listed state variables", partial=True)

     sv = get_state_variable_by_name(ctx._contract, requested)
     if sv is None:
         return (
             f"Variable '{requested}' not found.",
             Reward(value=cost, reason="Unknown state variable", partial=True),
         )

     details = f"{sv['type']} {sv['visibility']} {sv['name']}: {sv.get('description', '')}"
     return details, Reward(value=cost, reason="get_state_variable cost", partial=True)
 def get_call_graph(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle GET_CALL_GRAPH: render the contract's call graph as one line.

     Each entry of ``ctx._contract["call_graph"]`` is rendered as
     ``caller → [callee, ...]`` and entries are joined with ``"; "``. A missing
     call graph renders as an empty list of edges. A repeated query is charged
     the REPEATED cost. ``params`` is not used by this handler.
     """
     if ctx._is_repeated(qkey):
         penalty = Reward(value=ActionType.REPEATED.cost, reason="Repeated query", partial=True)
         return "Repeated query.", penalty

     call_graph = ctx._contract.get("call_graph", {})
     edges = []
     for caller, callees in call_graph.items():
         edges.append(f"{caller} → [{', '.join(callees)}]")
     rendered = "; ".join(edges)

     charge = Reward(
         value=ActionType.GET_CALL_GRAPH.cost,
         reason="get_call_graph cost",
         partial=True,
     )
     return f"Call graph: {rendered}", charge
 def submit(ctx: Any, qkey: str, params: Dict) -> Tuple[str, Reward]:
     """Handle SUBMIT action for Task 1.
     """
     if ctx._done:
         return (
             "Only ONE submission is allowed.",
+            Reward(value=ActionType.RESUBMIT.cost,
+                   reason="Second submit_function attempt",
+                   partial=False),
         )
     fn_name   = params.get("function_name", "").strip()
     vuln_type = params.get("vulnerability_type", "").strip()
     if not fn_name or not vuln_type:
         return (
             "submit_function requires both 'function_name' and "
             Reward(value=0.0, reason="Malformed submission", partial=False),
         )
+    ctx._done = True
+    score = ctx._grader.grade(fn_name, vuln_type, ctx._step_count, ctx._cummulative_cost)
+    return (f"Correct Answer: {ctx._grader.get_canonical_answer}"), Reward(
+        value=score,
         reason=f"submit_function score={score:.1f}",
         partial=False,
     )
     """Fallback for unknown actions."""
     return (
         f"Unknown action type: {action_type}",
+        Reward(value=ActionType.UNKNOWN.cost, reason="Unknown action", partial=True),
     )

server/tasks/task1/environment.py CHANGED Viewed

@@ -9,21 +9,11 @@ Episode flow:
   3. The agent uses actions to explore the contract (each costs a small penalty).
   4. When the agent submits, the Grader scores the answer and the episode ends.
-Reward shaping:
-  list_functions                            : -0.05
-  get_function_code                         : -0.10 (wrong function) / +0.05 (correct function)
-  get_function_summary                      : -0.05 (wrong function) / +0.03 (correct function)
-  get_file_metadata                         : -0.04
-  get_state_variable                        : -0.05
-  get_call_graph                            : -0.08
-  correct submit (score=1.0)                : +5.0
-  partially correct submit (score=0.5)      : +1.0
-  wrong submit (score=0.0)                  : -1.5
-  repeated query                            : -0.40
 """
 from __future__ import annotations
 import random
 from typing import Any, Dict, List, Optional, Set
@@ -60,16 +50,19 @@ class Task1Environment(BaseEnv):
     def __init__(self, contracts_path: Optional[str] = None) -> None:
         self._contracts = load_contracts(contracts_path) if contracts_path else load_contracts()
         self._rng = random.Random()
         # Episode state (initialised by reset)
         self._contract: Dict[str, Any] = {}
         self._target_fn: Dict[str, Any] = {}
         self._grader: Optional[Task1Grader] = None
         self._step_count: int = 0
-        self._cumulative_reward: float = 0.0
         self._done: bool = False
         self._query_history: List[str] = []
         self._seen_queries: Set[str] = set()
     # ------------------------------------------------------------------
     # OpenEnv interface
@@ -84,9 +77,10 @@ class Task1Environment(BaseEnv):
         self._grader = Task1Grader(
             target_function=self._target_fn["name"],
             vulnerability_issue=self._target_fn["vulnerability_details"]["issue"],
         )
         self._step_count = 0
-        self._cumulative_reward = 0.0
         self._done = False
         self._query_history = []
         self._seen_queries = set()
@@ -102,28 +96,28 @@ class Task1Environment(BaseEnv):
     def step(self, action: Action) -> StepResult:
         """Execute one agent action."""
         if self._done:
             raise RuntimeError("Episode is done. Call reset() to start a new episode.")
-        self._step_count += 1
-        # Dispatch
         result_text, reward = self._dispatch(action)
-        self._cumulative_reward += reward.value
-        self._query_history.append(f"[{action.action_type}] → {result_text[:120]}")
         obs = self._build_observation(
             last_action=action.action_type,
             last_result=result_text,
         )
         return StepResult(
             observation=obs,
             reward=reward,
             done=self._done,
             info={
                 "step": self._step_count,
-                "cumulative_reward": self._cumulative_reward,
             },
         )
@@ -133,7 +127,7 @@ class Task1Environment(BaseEnv):
             contract_name=self._contract.get("contract_name", ""),
             target_function=self._target_fn.get("name", ""),
             step_count=self._step_count,
-            cumulative_reward=self._cumulative_reward,
             done=self._done,
             query_history=list(self._query_history),
         )
@@ -150,12 +144,8 @@ class Task1Environment(BaseEnv):
         return Observation(
             task_id=TASK_ID,
             contract_name=self._contract.get("contract_name", ""),
-            contract_description=self._contract.get("metadata", {}).get("description", ""),
-            available_actions=[a.value for a in AVAILABLE_ACTIONS],
             last_action=last_action,
             last_action_result=last_result,
-            step_count=self._step_count,
-            cumulative_reward=self._cumulative_reward,
             done=self._done,
             extra={
                 "solidity_version": self._contract.get("metadata", {}).get("solidity_version", ""),
@@ -181,7 +171,7 @@ class Task1Environment(BaseEnv):
         at = action.action_type
         params = action.params
         qkey = self._query_key(at, params)
         # Mapping from ActionType to handler function
         handlers = {
             ActionType.LIST_FUNCTIONS:       actions.list_functions,

   3. The agent uses actions to explore the contract (each costs a small penalty).
   4. When the agent submits, the Grader scores the answer and the episode ends.
 """
 from __future__ import annotations
+from math import floor, log2
 import random
 from typing import Any, Dict, List, Optional, Set
     def __init__(self, contracts_path: Optional[str] = None) -> None:
         """Load the contract set and zero out all per-episode state.

         Args:
             contracts_path: Optional location passed to ``load_contracts``;
                 when falsy, ``load_contracts`` is called with no argument.
         """
         if contracts_path:
             self._contracts = load_contracts(contracts_path)
         else:
             self._contracts = load_contracts()
         self._rng = random.Random()
         self._max_steps: int = 0

         # Episode state: placeholders only, (re)initialised by reset().
         self._contract: Dict[str, Any] = {}
         self._target_fn: Dict[str, Any] = {}
         self._grader: Optional[Task1Grader] = None
         self._step_count: int = 0
         # Running sum of the reward values returned by each dispatched action.
         self._cummulative_cost: float = 0.0
         self._done: bool = False
         self._query_history: List[str] = []
         self._seen_queries: Set[str] = set()
         self._cost_free_steps: int = 0
         self._decay: float = 0.0
     # ------------------------------------------------------------------
     # OpenEnv interface
         self._grader = Task1Grader(
             target_function=self._target_fn["name"],
             vulnerability_issue=self._target_fn["vulnerability_details"]["issue"],
+            n = floor(log2(len(self._contract["functions"])))
         )
         self._step_count = 0
+        self._cummulative_cost = 0.0
         self._done = False
         self._query_history = []
         self._seen_queries = set()
     def step(self, action: Action) -> StepResult:
         """Run a single agent action and return the resulting transition.

         The step counter is advanced before dispatching, so handlers observe
         the count that includes the current action. The reward value of the
         action is added to the running cost, and a truncated (200 character)
         trace of the result is appended to the query history.

         Raises:
             RuntimeError: if the episode has already ended, or if the step
                 count is already greater than ``self._max_steps``.
         """
         if self._done:
             raise RuntimeError("Episode is done. Call reset() to start a new episode.")
         if self._step_count > self._max_steps:
             raise RuntimeError("Exceeded maximum number of steps allowed. Call reset() to start a new episode.")

         self._step_count = self._step_count + 1

         outcome_text, step_reward = self._dispatch(action)
         self._cummulative_cost = self._cummulative_cost + step_reward.value

         history_entry = f"[{action.action_type}] → {outcome_text[:200]}"
         self._query_history.append(history_entry)

         observation = self._build_observation(
             last_action=action.action_type,
             last_result=outcome_text,
         )
         info = {
             "step": self._step_count,
             "cumulative_reward": self._cummulative_cost,
         }
         return StepResult(
             observation=observation,
             reward=step_reward,
             done=self._done,
             info=info,
         )
             contract_name=self._contract.get("contract_name", ""),
             target_function=self._target_fn.get("name", ""),
             step_count=self._step_count,
+            cumulative_reward=self._cummulative_cost,
             done=self._done,
             query_history=list(self._query_history),
         )
         return Observation(
             task_id=TASK_ID,
             contract_name=self._contract.get("contract_name", ""),
             last_action=last_action,
             last_action_result=last_result,
             done=self._done,
             extra={
                 "solidity_version": self._contract.get("metadata", {}).get("solidity_version", ""),
         at = action.action_type
         params = action.params
         qkey = self._query_key(at, params)
         # Mapping from ActionType to handler function
         handlers = {
             ActionType.LIST_FUNCTIONS:       actions.list_functions,

server/tasks/task1/grader.py CHANGED Viewed

@@ -1,58 +1,30 @@
 """
 grader.py  (Task 1 – Targeted Vulnerability Detection)
 -------------------------------------------------------
-Deterministic grader. Grade range: 0.0 – 1.0
-  1.0 – correct function + correct vulnerability keyword
-  0.5 – correct function + wrong/unrecognised vulnerability keyword
-  0.0 – wrong function name
-reward_for_score() normalises the raw RL reward to [0.0, 1.0]
-using the fixed reward bounds [MIN_REWARD=-1.5, MAX_REWARD=5.0]:
-  normalised = (raw + 1.5) / 6.5
 """
 from __future__ import annotations
 from typing import Dict
 from utils import SemanticMatcher
-# Raw reward bounds — used only for normalisation
-_MIN_REWARD = -1.5
-_MAX_REWARD =  5.0
-_REWARD_RANGE = _MAX_REWARD - _MIN_REWARD  # 6.5
-_SCORE_MIN = 0.001   # grades are strictly (0, 1)
-_SCORE_MAX = 0.999
-def _clamp(v: float) -> float:
-    return max(_SCORE_MIN, min(_SCORE_MAX, v))
 class Task1Grader:
-    def __init__(self, target_function: str, vulnerability_issue: str) -> None:
         self.target_function     = target_function.lower()
         self.vulnerability_issue = vulnerability_issue
-    def grade_submission(self, submitted_function: str, submitted_vuln_type: str) -> float:
         """Returns grade strictly in (0, 1)."""
-        if submitted_function.strip().lower() != self.target_function:
-            return _clamp(0.0)   # → 0.001
-        return _clamp(1.0) if SemanticMatcher().match(self.vulnerability_issue, submitted_vuln_type) else _clamp(0.5)
-    def reward_for_score(self, score: float) -> float:
-        """
-        Maps grade score → normalised reward strictly in (0, 1).
-        Raw rewards:  correct=+5.0, partial=+1.0, wrong=-1.5
-        Normalised:   (raw + 1.5) / 6.5  then clamped to (0.001, 0.999)
-        """
-        if score >= _SCORE_MAX:
-            raw = 5.0
-        elif score >= 0.5:
-            raw = 1.0
-        else:
-            raw = -1.5
-        return _clamp((raw - _MIN_REWARD) / _REWARD_RANGE)
     def get_canonical_answer(self) -> Dict[str, str]:
         return {"function": self.target_function, "vulnerability": self.vulnerability_issue}

 """
 grader.py  (Task 1 – Targeted Vulnerability Detection)
 -------------------------------------------------------
+Deterministic grader. Grade range: (0, 1)
 """
 from __future__ import annotations
 from typing import Dict
 from utils import SemanticMatcher
 class Task1Grader:
     """Grades a Task 1 submission (vulnerable function + vulnerability type).

     The grade is 0.0 unless both the function name and the vulnerability
     description match. A fully matching submission starts at 1.0 and is
     decayed exponentially by however much exploration cost the agent spent
     beyond a free budget worth ``n + 2`` average-priced steps.
     """

     def __init__(self, target_function: str, vulnerability_issue: str, n: int) -> None:
         """Store the canonical answer and the contract-size heuristic.

         Args:
             target_function: Name of the vulnerable function (compared
                 case-insensitively).
             vulnerability_issue: Canonical description of the vulnerability.
             n: Contract-size heuristic; the caller passes the floored log2 of
                 the number of functions in the contract.
         """
         self.target_function = target_function.lower()
         self.vulnerability_issue = vulnerability_issue
         # Log of the number of functions (n) is a heuristic for the size of the contract code.
         self.n = n
         # Base of the exponential penalty applied per unit of overspent cost.
         self._decay = 0.75

     def grade(self, submitted_function: str, submitted_vuln_type: str, steps: int, cummulative_cost: float) -> float:
         """Return the grade for a submission, in the closed range [0.0, 1.0].

         Args:
             submitted_function: Function name submitted by the agent.
             submitted_vuln_type: Vulnerability description submitted by the agent.
             steps: Number of steps taken in the episode, including the submit.
             cummulative_cost: Sum of the per-action reward values. Action
                 costs are stored as negative numbers, so only the magnitude
                 is used here.

         Returns:
             0.0 if the function name or the vulnerability does not match;
             otherwise ``decay ** overspend`` where ``overspend`` is the cost
             spent beyond the free budget (never negative).
         """
         # Wrong function: nothing else can earn credit, so skip the matcher.
         if submitted_function.strip().lower() != self.target_function:
             return 0.0
         if not SemanticMatcher().match(self.vulnerability_issue, submitted_vuln_type):
             return 0.0

         # Costs accumulate as negative rewards; measure spending by magnitude
         # so that spending MORE than the budget is what triggers the decay.
         spent = abs(cummulative_cost)
         # Free budget = (n + 2) steps at the episode's average per-step cost.
         # With no recorded steps there is no average, hence no free budget.
         free_budget = (spent / steps) * (self.n + 2) if steps > 0 else 0.0
         overspend = max(0.0, spent - free_budget)
         return float(self._decay ** overspend)

     def get_canonical_answer(self) -> Dict[str, str]:
         """Return the expected function name (lower-cased) and vulnerability description."""
         return {"function": self.target_function, "vulnerability": self.vulnerability_issue}

utils/semanticmatcher.py CHANGED Viewed

@@ -142,18 +142,6 @@ def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
         return 0.0
     return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
-# ── Score clamping ───────────────────────────────────────────────────────────
-_SCORE_MIN = 0.001   # scores are strictly (0, 1) — never touch 0 or 1
-_SCORE_MAX = 0.999
-def _clamp(score: float) -> float:
-    """Clamp score to the open interval (0, 1): [_SCORE_MIN, _SCORE_MAX]."""
-    return max(_SCORE_MIN, min(_SCORE_MAX, score))
 # ── Core matcher ──────────────────────────────────────────────────────────────
 class SemanticMatcher:
@@ -212,7 +200,7 @@ class SemanticMatcher:
         # Fast-path: normalized exact match
         if normalize(text_a) == normalize(text_b):
             self.confidence_level = "strong"
-            return _clamp(1.0)   # → 0.999  (strictly less than 1)
         tokens_a = tokenize_and_lemmatize(text_a)
         tokens_b = tokenize_and_lemmatize(text_b)
@@ -230,7 +218,7 @@ class SemanticMatcher:
             self.confidence_level = "moderate"
         else:
             self.confidence_level = "no_match"
-        return _clamp(score)   # strictly in (0, 1)
     def match(self, text_a: str, text_b: str) -> bool:
         """Return True if the two texts are considered a match based on the score."""

         return 0.0
     return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
 # ── Core matcher ──────────────────────────────────────────────────────────────
 class SemanticMatcher:
         # Fast-path: normalized exact match
         if normalize(text_a) == normalize(text_b):
             self.confidence_level = "strong"
+            return 1.0
         tokens_a = tokenize_and_lemmatize(text_a)
         tokens_b = tokenize_and_lemmatize(text_b)
             self.confidence_level = "moderate"
         else:
             self.confidence_level = "no_match"
+        return score
     def match(self, text_a: str, text_b: str) -> bool:
         """Return True if the two texts are considered a match based on the score."""