Spaces:

h1manshu
/

code_review

Sleeping

App Files Files Community

h1manshu committed on 14 days ago

Commit

8684af9

verified ·

1 Parent(s): ea0c28f

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +2 -2
dataset/dataset.json +200 -0
inference.py +2 -4
openenv.yaml +50 -10

README.md CHANGED Viewed

@@ -165,7 +165,7 @@ Security vulnerabilities, injection attacks, and cross-file null-handling bugs.
 - For SQL: detect string concatenation and replace with a parameterized query (`%s` placeholder + `cursor.execute`).
 - For null bug: validate `id is not None` before the `db[id]` lookup, and fix the call site in `controller.py`.
-The agent runs `NUM_EPISODES = 4` episodes (configurable) with each `MAX_STEPS = 3` and logs each step:
 ```
 [START] task=code_review env=code_review_benchmark model=meta-llama/Llama-3.1-8B-Instruct
@@ -182,7 +182,7 @@ Key constants in `inference.py`:
 | Constant | Default | Description |
 |----------|---------|-------------|
 | `MAX_STEPS` | `3` | Steps per episode |
-| `NUM_EPISODES` | `4` | Number of PRs to review |
 | `TEMPERATURE` | `0.2` | Sampling temperature (lower = more deterministic) |
 | `MAX_TOKENS` | `256` | Max tokens per LLM response |
 | `SUCCESS_SCORE_THRESHOLD` | `0.1` | Minimum normalized score to count as success |

 - For SQL: detect string concatenation and replace with a parameterized query (`%s` placeholder + `cursor.execute`).
 - For null bug: validate `id is not None` before the `db[id]` lookup, and fix the call site in `controller.py`.
+The agent runs `NUM_EPISODES = 16` episodes (configurable), each limited to `MAX_STEPS = 3` steps, and logs each step:
 ```
 [START] task=code_review env=code_review_benchmark model=meta-llama/Llama-3.1-8B-Instruct
 | Constant | Default | Description |
 |----------|---------|-------------|
 | `MAX_STEPS` | `3` | Steps per episode |
+| `NUM_EPISODES` | `16` | Number of PRs to review |
 | `TEMPERATURE` | `0.2` | Sampling temperature (lower = more deterministic) |
 | `MAX_TOKENS` | `256` | Max tokens per LLM response |
 | `SUCCESS_SCORE_THRESHOLD` | `0.1` | Minimum normalized score to count as success |

dataset/dataset.json CHANGED Viewed

@@ -19,6 +19,86 @@
       "fix": "from datetime import datetime\nprint(datetime.now())"
     }
   },
   {
     "task_type": "medium",
     "pr": {
@@ -59,6 +139,66 @@
       "fix": "return target in arr"
     }
   },
   {
     "task_type": "hard",
     "pr": {
@@ -122,5 +262,65 @@
       "decision": "reject",
       "fix": "def get_user(id):\n    if id is None:\n        raise ValueError('id must not be None')\n    return db[id]\n\nuser = get_user(user_id)"
     }
   }
 ]

       "fix": "from datetime import datetime\nprint(datetime.now())"
     }
   },
+  {
+    "task_type": "easy",
+    "pr": {
+      "id": "9",
+      "title": "Missing return statement",
+      "description": "Function does not return value",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "utils.py",
+          "diff": "def add(a, b):\n    result = a + b"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": ["missing return statement"],
+      "decision": "reject",
+      "fix": "def add(a, b):\n    result = a + b\n    return result"
+    }
+  },
+  {
+    "task_type": "easy",
+    "pr": {
+      "id": "10",
+      "title": "Wrong comparison operator",
+      "description": "Fix equality check",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "check.py",
+          "diff": "if x = 10:\n    print('ten')"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": ["assignment instead of comparison"],
+      "decision": "reject",
+      "fix": "if x == 10:\n    print('ten')"
+    }
+  },
+  {
+    "task_type": "easy",
+    "pr": {
+      "id": "11",
+      "title": "Undefined variable",
+      "description": "Variable used before assignment",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "app.py",
+          "diff": "def greet():\n    print(message)\n    message = 'Hello'"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": ["undefined variable", "variable used before assignment"],
+      "decision": "reject",
+      "fix": "def greet():\n    message = 'Hello'\n    print(message)"
+    }
+  },
+  {
+    "task_type": "easy",
+    "pr": {
+      "id": "12",
+      "title": "Clean utility function",
+      "description": "Simple string helper",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "utils.py",
+          "diff": "def to_upper(s):\n    return s.upper()"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": [],
+      "decision": "approve",
+      "fix": ""
+    }
+  },
   {
     "task_type": "medium",
     "pr": {
       "fix": "return target in arr"
     }
   },
+  {
+    "task_type": "medium",
+    "pr": {
+      "id": "13",
+      "title": "Mutable default argument",
+      "description": "Function with default list argument",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "helper.py",
+          "diff": "def append_item(item, lst=[]):\n    lst.append(item)\n    return lst"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": ["mutable default argument"],
+      "decision": "reject",
+      "fix": "def append_item(item, lst=None):\n    if lst is None:\n        lst = []\n    lst.append(item)\n    return lst"
+    }
+  },
+  {
+    "task_type": "medium",
+    "pr": {
+      "id": "14",
+      "title": "Unhandled exception",
+      "description": "File read without error handling",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "reader.py",
+          "diff": "def read_file(path):\n    with open(path) as f:\n        return f.read()"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": ["unhandled exception", "missing error handling"],
+      "decision": "reject",
+      "fix": "def read_file(path):\n    try:\n        with open(path) as f:\n            return f.read()\n    except FileNotFoundError:\n        return None"
+    }
+  },
+  {
+    "task_type": "medium",
+    "pr": {
+      "id": "15",
+      "title": "Missing input validation",
+      "description": "Factorial computed without checking for negative input",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "compute.py",
+          "diff": "def factorial(n):\n    result = 1\n    for i in range(1, n+1):\n        result *= i\n    return result"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": ["missing input validation"],
+      "decision": "reject",
+      "fix": "def factorial(n):\n    if n < 0:\n        raise ValueError('n must be non-negative')\n    result = 1\n    for i in range(1, n+1):\n        result *= i\n    return result"
+    }
+  },
   {
     "task_type": "hard",
     "pr": {
       "decision": "reject",
       "fix": "def get_user(id):\n    if id is None:\n        raise ValueError('id must not be None')\n    return db[id]\n\nuser = get_user(user_id)"
     }
+  },
+  {
+    "task_type": "hard",
+    "pr": {
+      "id": "16",
+      "title": "Race condition in counter",
+      "description": "Shared counter increment",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "counter.py",
+          "diff": "counter = 0\n\ndef increment():\n    global counter\n    counter += 1"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": ["race condition", "thread safety"],
+      "decision": "reject",
+      "fix": "import threading\n\ncounter = 0\nlock = threading.Lock()\n\ndef increment():\n    global counter\n    with lock:\n        counter += 1"
+    }
+  },
+  {
+    "task_type": "hard",
+    "pr": {
+      "id": "17",
+      "title": "Insecure deserialization",
+      "description": "Load user data from request",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "api.py",
+          "diff": "import pickle\n\ndef load_user(data):\n    return pickle.loads(data)"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": ["insecure deserialization", "security vulnerability"],
+      "decision": "reject",
+      "fix": "import json\n\ndef load_user(data):\n    return json.loads(data)"
+    }
+  },
+  {
+    "task_type": "hard",
+    "pr": {
+      "id": "18",
+      "title": "Path traversal vulnerability",
+      "description": "Serve user requested files",
+      "language": "python",
+      "diffs": [
+        {
+          "file_name": "files.py",
+          "diff": "def read_file(filename):\n    with open('/var/data/' + filename) as f:\n        return f.read()"
+        }
+      ]
+    },
+    "ground_truth": {
+      "issues": ["path traversal", "security vulnerability"],
+      "decision": "reject",
+      "fix": "import os\n\ndef read_file(filename):\n    base = '/var/data/'\n    full_path = os.path.realpath(os.path.join(base, filename))\n    if not full_path.startswith(base):\n        raise ValueError('Invalid file path')\n    with open(full_path) as f:\n        return f.read()"
+    }
   }
 ]

inference.py CHANGED Viewed

@@ -34,7 +34,7 @@ BENCHMARK = "code_review_benchmark"
 MAX_STEPS = 3
 TEMPERATURE = 0.2
 MAX_TOKENS = 256
-NUM_EPISODES = 6
 _MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
 MAX_TOTAL_REWARD = NUM_EPISODES * MAX_STEPS * _MAX_REWARD_PER_STEP
 SUCCESS_SCORE_THRESHOLD = 0.1  # normalized score in [0, 1]
@@ -235,7 +235,7 @@ async def run_episode(client, env):
         action_str = action_dict.get("action_type", "unknown")
         log_step(step=step, action=action_str, reward=reward, done=done, error=None)
         final_score = max(final_score, reward if reward else 0.0)
     return final_score
@@ -250,8 +250,6 @@ async def main():
         for i in range(NUM_EPISODES):
             task_name = f"task_{i+1}"
-            # START log must use task id from openenv.yaml
             log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
             env.task_index = i

 MAX_STEPS = 3
 TEMPERATURE = 0.2
 MAX_TOKENS = 256
+NUM_EPISODES = 16
 _MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
 MAX_TOTAL_REWARD = NUM_EPISODES * MAX_STEPS * _MAX_REWARD_PER_STEP
 SUCCESS_SCORE_THRESHOLD = 0.1  # normalized score in [0, 1]
         action_str = action_dict.get("action_type", "unknown")
         log_step(step=step, action=action_str, reward=reward, done=done, error=None)
         final_score = max(final_score, reward if reward else 0.0)
     return final_score
         for i in range(NUM_EPISODES):
             task_name = f"task_{i+1}"
             log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
             env.task_index = i

openenv.yaml CHANGED Viewed

@@ -8,27 +8,67 @@ tasks:
   - id: task_1
     description: "Easy — missing import detection"
     max_steps: 3
-    grader: graders:CodeReviewGrader
   - id: task_2
-    description: "Medium — division by zero handling"
     max_steps: 3
-    grader: graders:CodeReviewGrader
   - id: task_3
-    description: "Medium — inefficient loop optimization"
     max_steps: 3
-    grader: graders:CodeReviewGrader
   - id: task_4
-    description: "Hard — hardcoded password security vulnerability"
     max_steps: 3
-    grader: graders:CodeReviewGrader
   - id: task_5
-    description: "Hard — SQL injection vulnerability"
     max_steps: 3
-    grader: graders:CodeReviewGrader
   - id: task_6
     description: "Hard — cross-file null handling bug"
     max_steps: 3
-    grader: graders:CodeReviewGrader
 endpoints:
   reset: /reset
   step: /step

   - id: task_1
     description: "Easy — missing import detection"
     max_steps: 3
+    grader: graders:EasyGrader
   - id: task_2
+    description: "Easy — missing return statement"
     max_steps: 3
+    grader: graders:EasyGrader
   - id: task_3
+    description: "Easy — wrong comparison operator"
     max_steps: 3
+    grader: graders:EasyGrader
   - id: task_4
+    description: "Easy — undefined variable"
     max_steps: 3
+    grader: graders:EasyGrader
   - id: task_5
+    description: "Easy — clean utility function"
     max_steps: 3
+    grader: graders:EasyGrader
   - id: task_6
+    description: "Medium — division by zero handling"
+    max_steps: 3
+    grader: graders:MediumGrader
+  - id: task_7
+    description: "Medium — inefficient loop optimization"
+    max_steps: 3
+    grader: graders:MediumGrader
+  - id: task_8
+    description: "Medium — mutable default argument"
+    max_steps: 3
+    grader: graders:MediumGrader
+  - id: task_9
+    description: "Medium — unhandled exception"
+    max_steps: 3
+    grader: graders:MediumGrader
+  - id: task_10
+    description: "Medium — missing input validation"
+    max_steps: 3
+    grader: graders:MediumGrader
+  - id: task_11
+    description: "Hard — hardcoded password security vulnerability"
+    max_steps: 3
+    grader: graders:HardGrader
+  - id: task_12
+    description: "Hard — SQL injection vulnerability"
+    max_steps: 3
+    grader: graders:HardGrader
+  - id: task_13
     description: "Hard — cross-file null handling bug"
     max_steps: 3
+    grader: graders:HardGrader
+  - id: task_14
+    description: "Hard — race condition in counter"
+    max_steps: 3
+    grader: graders:HardGrader
+  - id: task_15
+    description: "Hard — insecure deserialization"
+    max_steps: 3
+    grader: graders:HardGrader
+  - id: task_16
+    description: "Hard — path traversal vulnerability"
+    max_steps: 3
+    grader: graders:HardGrader
 endpoints:
   reset: /reset
   step: /step