Spaces:

h1manshu
/

code_review

Sleeping

App Files Files Community

h1manshu committed 9 days ago

Commit

215184f

verified ·

1 Parent(s): a6de3cf

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

server/code_review_environment.py +57 -229
server/graders.py +353 -0

server/code_review_environment.py CHANGED Viewed

@@ -7,8 +7,8 @@
 """
 Code Review Environment Implementation.
-A simple test environment that echoes back messages sent to it.
-Perfect for testing HTTP server infrastructure.
 """
 from uuid import uuid4
@@ -35,93 +35,65 @@ except ImportError:
 import json
 from pathlib import Path
-import re
-from difflib import SequenceMatcher
-dataset_path = Path(__file__).parent.parent / "dataset" / "dataset.json"
-STOP_WORDS = {
-    "use",
-    "the",
-    "a",
-    "an",
-    "to",
-    "and",
-    "or",
-    "of",
-    "in",
-    "for",
-    "with",
-    "is",
-    "it",
-    "on",
-    "at",
-    "by",
-    "from",
-    "that",
-}
 class CodeReviewEnvironment(Environment):
     """
-    A simple echo environment that echoes back messages.
-    This environment is designed for testing the HTTP server infrastructure.
-    It maintains minimal state and simply echoes back whatever message it receives.
     Example:
-        >>> env = CodeReviewEnvironment()
         >>> obs = env.reset()
-        >>> print(obs.echoed_message)  # "Code Review environment ready!"
-        >>>
-        >>> obs = env.step(CodeReviewAction(message="Hello"))
-        >>> print(obs.echoed_message)  # "Hello"
-        >>> print(obs.message_length)  # 5
     """
-    # Enable concurrent WebSocket sessions.
-    # Set to True if your environment isolates state between instances.
-    # When True, multiple WebSocket clients can connect simultaneously, each
-    # getting their own environment instance (when using factory mode in app.py).
     SUPPORTS_CONCURRENT_SESSIONS: bool = True
-    def __init__(self):
-        """Initialize the code_review environment."""
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._reset_count = 0
         self.max_steps = 5
         self.task_index = 0
         with open(dataset_path) as f:
             self.dataset = json.load(f)
         self.reset()
     def reset(self) -> CodeReviewObservation:
-        """
-        Reset the environment.
-        Returns:
-            CodeReviewObservation with a ready message
-        """
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._reset_count += 1
         self.task_index += 1
         self.sample = self.dataset[self.task_index % len(self.dataset)]
-        self.pr = CodeReviewPullRequest(**self.sample["pr"])
-        self.gt = self.sample["ground_truth"]
         self.task_type = self.sample.get("task_type", "unknown")
-        self.history = []
-        self.step_count = 0
-        self.done = False
-        # State evolution variables
-        self.issues_identified = []
-        self.fix_attempted = False
         return CodeReviewObservation(
-            # echoed_message="Code Review environment ready!",
             pr=self.pr,
             previous_comments=self.history,
             step_count=self.step_count,
@@ -130,41 +102,34 @@ class CodeReviewEnvironment(Environment):
             done=False,
         )
-    def step(self, action: CodeReviewAction) -> CodeReviewObservation:  # type: ignore[override]
-        """
-        Execute a step in the environment by echoing the message.
-        Args:
-            action: CodeReviewAction containing the message to echo
-        Returns:
-            CodeReviewObservation with the echoed message and its length
-        """
         self._state.step_count += 1
-        # print("RAW ACTION TYPE:", type(action))
-        # print("RAW ACTION:", action)
         try:
             if isinstance(action, dict):
                 action = CodeReviewAction(**action)
             elif isinstance(action, (list, tuple)):
                 action = CodeReviewAction(
                     action_type=action[0],
-                    comment=action[1] if len(action) > 1 else None,
                     suggested_code=action[2] if len(action) > 2 else None,
-                    decision=action[3] if len(action) > 3 else None,
                 )
             elif isinstance(action, CodeReviewAction):
                 pass
             else:
                 raise ValueError(f"Unsupported action type: {type(action)}")
         except Exception as e:
-            print(f"Error occurred while processing action: {e}")
             return self._invalid_step()
         self.step_count += 1
         self.history.append(action)
@@ -174,79 +139,54 @@ class CodeReviewEnvironment(Environment):
         if action.action_type == "suggest_fix":
             self.fix_attempted = True
-        score = self.grade_action(action, self.gt)
-        # print(f"Step {self.step_count} - Score: {score:.4f}")
-        bonus = 0.0
-        # Encourage meaningful comments
-        if action.comment and len(action.comment) > 30:
-            bonus += 0.1
-        # Encourage early correct decisions
-        if action.action_type == "final_decision" and self.step_count <= 2:
-            bonus += 0.1
-        # Penalize useless steps
-        if not action.comment and action.action_type != "final_decision":
-            bonus -= 0.1
-        # Penalize long trajectories
-        if self.step_count > 3:
-            bonus -= 0.05
-        score += bonus
-        score = max(0.01, min(score, 0.99))
-        # print("Final Score == " , score)
         done = (
-            action.action_type == "final_decision" or self.step_count >= self.max_steps
         )
         if done:
-            score = max([self.grade_action(a, self.gt) for a in self.history] or [0.0])
-            score = max(0.01, min(score, 0.99))
         obs = CodeReviewObservation(
             pr=self.pr,
             previous_comments=[a.comment for a in self.history if a.comment],
             step_count=self.step_count,
             max_steps=self.max_steps,
         )
-        # print("Obs == " , obs)
         rew = CodeReviewReward(score=score, feedback="graded")
-        print("Score == ", type(rew.score), " --- ", rew.score)
-        # print("FINAL REWARD TYPE:", type(rew))
-        # print("FINAL REWARD:", rew)
-        # print("Got the culprit I guess....")
         return CodeReviewStepResponse(
             observation=obs,
             reward=rew.score,
             done=done,
             info={
-                "task_type": self.task_type,
                 "issues_identified": len(self.issues_identified),
-                "fix_attempted": self.fix_attempted,
             },
         )
     @property
     def state(self) -> State:
-        """
-        Get the current environment state.
-        Returns:
-            Current State with episode_id and step_count
-        """
         return self._state
-    def _invalid_step(self):
         rew = CodeReviewReward(score=0.0, feedback="invalid action")
         obs = CodeReviewObservation(
-            echoed_message="Invalid action format. Please send a valid CodeReviewAction.",
             pr=self.pr,
             previous_comments=[a.comment for a in self.history if a.comment],
             step_count=self.step_count,
@@ -257,116 +197,4 @@ class CodeReviewEnvironment(Environment):
             reward=rew,
             done=True,
             info={"error": "invalid_action"},
-        )
-    def grade_action(self, action, ground_truth):
-        score = 0.0
-        # print("Action === ", action)
-        # print("Ground truth === ", ground_truth)
-        # ------------------------------
-        # ISSUE DETECTION (40%)
-        # ------------------------------
-        issue_score = self.score_issues(action.comment, ground_truth)
-        score += 0.4 * issue_score
-        # print("After Issue Score == ", issue_score)
-        # ------------------------------
-        # FIX QUALITY (30%)
-        # ------------------------------
-        fix_score = self.score_fix(action.suggested_code, ground_truth)
-        score += 0.3 * fix_score
-        # print("After Fix Score == ", fix_score)
-        # ------------------------------
-        # DECISION (30%)
-        # ------------------------------
-        decision_score = self.score_decision(action, ground_truth)
-        score += 0.3 * decision_score
-        # print("After Decision Score == ", decision_score)
-        # ------------------------------
-        # CLAMP SCORE
-        # ------------------------------
-        score = max(0.01, min(score, 0.99))
-        return score
-    def normalize(self, text):
-        return (text or "").lower().strip()
-    # ==============================
-    # ISSUE MATCH (PARTIAL CREDIT)
-    # ==============================
-    def score_issues(self, comment, ground_truth):
-        issues = ground_truth.get("issues", [])
-        if not comment or not issues:
-            return 0.0
-        comment = self.normalize(comment)
-        matches = sum(1 for issue in issues if self.normalize(issue) in comment)
-        return matches / len(issues)
-    # ==============================
-    # FIX MATCH (FUZZY)
-    # ==============================
-    def score_fix(self, suggested_code: str, ground_truth: dict) -> float:
-        if not suggested_code:
-            return 0.0
-        expected_fix = self.normalize(ground_truth.get("fix", ""))
-        suggested_code = self.normalize(suggested_code)
-        if not expected_fix:
-            return 0.0
-        # 1. Exact / substring match — full score
-        if expected_fix in suggested_code:
-            return 1.0
-        # 2. Token overlap ignoring stop words
-        def code_tokens(text: str) -> list[str]:
-            tokens = re.findall(r"[a-zA-Z_]\w*|\d+|[=<>!+\-*/]+", text)
-            return [t for t in tokens if t.lower() not in STOP_WORDS]
-        expected_tokens = code_tokens(expected_fix)
-        suggested_tokens = set(code_tokens(suggested_code))
-        if not expected_tokens:
-            return 0.0
-        token_score = sum(1 for t in expected_tokens if t in suggested_tokens) / len(
-            expected_tokens
-        )
-        # 3. Sequence similarity as a secondary signal
-        seq_score = SequenceMatcher(None, expected_fix, suggested_code).ratio()
-        # Weighted: token overlap matters more than character similarity
-        return round(0.7 * token_score + 0.3 * seq_score, 4)
-    # ==============================
-    # DECISION MATCH
-    # ==============================
-    def score_decision(self, action, ground_truth):
-        expected = ground_truth.get("decision")
-        # Not a decision step → no contribution
-        if action.action_type != "final_decision":
-            return 0.0
-        # Missing decision → small penalty
-        if not action.decision:
-            return 0.0
-        # Correct decision
-        if action.decision == expected:
-            return 1.0
-        # Wrong decision → partial penalty (not negative)
-        return 0.2

 """
 Code Review Environment Implementation.
+Supports three grader difficulty levels: "easy", "medium", "hard".
+Pass `grader_level` to the constructor to select the desired tier.
 """
 from uuid import uuid4
 import json
 from pathlib import Path
+try:
+    from .graders import get_grader
+except ImportError:
+    from graders import get_grader
+dataset_path = Path(__file__).parent.parent / "dataset" / "dataset.json"
 class CodeReviewEnvironment(Environment):
     """
+    Code Review environment with configurable grading difficulty.
+    Args:
+        grader_level: Grading difficulty — one of "easy", "medium", "hard".
+                      Defaults to "medium".
     Example:
+        >>> env = CodeReviewEnvironment(grader_level="hard")
         >>> obs = env.reset()
+        >>> obs = env.step(CodeReviewAction(action_type="final_decision", decision="approve"))
     """
     SUPPORTS_CONCURRENT_SESSIONS: bool = True
+    def __init__(self, grader_level: str = "medium"):
+        """Initialise the environment with the chosen grader tier."""
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._reset_count = 0
         self.max_steps = 5
         self.task_index = 0
         with open(dataset_path) as f:
             self.dataset = json.load(f)
         self.reset()
     def reset(self) -> CodeReviewObservation:
+        """Reset the environment and advance to the next task."""
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._reset_count += 1
         self.task_index += 1
         self.sample = self.dataset[self.task_index % len(self.dataset)]
+        self.pr        = CodeReviewPullRequest(**self.sample["pr"])
+        self.gt        = self.sample["ground_truth"]
         self.task_type = self.sample.get("task_type", "unknown")
+        grader_level = self.task_type if self.task_type in ("easy", "medium", "hard") else "medium"
+        self.grader = get_grader(grader_level)
+        self.grader_level = grader_level
+        self.history            = []
+        self.step_count         = 0
+        self.done               = False
+        self.issues_identified  = []
+        self.fix_attempted      = False
         return CodeReviewObservation(
             pr=self.pr,
             previous_comments=self.history,
             step_count=self.step_count,
             done=False,
         )
+    def step(self, action: CodeReviewAction) -> CodeReviewStepResponse:  # type: ignore[override]
+        """Execute one step: grade the action and return an observation + reward."""
         self._state.step_count += 1
+        # ------------------------------------------------------------------
+        # Normalise action into a CodeReviewAction object
+        # ------------------------------------------------------------------
         try:
             if isinstance(action, dict):
                 action = CodeReviewAction(**action)
             elif isinstance(action, (list, tuple)):
                 action = CodeReviewAction(
                     action_type=action[0],
+                    comment=action[1]      if len(action) > 1 else None,
                     suggested_code=action[2] if len(action) > 2 else None,
+                    decision=action[3]     if len(action) > 3 else None,
                 )
             elif isinstance(action, CodeReviewAction):
                 pass
             else:
                 raise ValueError(f"Unsupported action type: {type(action)}")
         except Exception as e:
+            print(f"Error processing action: {e}")
             return self._invalid_step()
+        # ------------------------------------------------------------------
+        # Update state
+        # ------------------------------------------------------------------
         self.step_count += 1
         self.history.append(action)
         if action.action_type == "suggest_fix":
             self.fix_attempted = True
+        # ------------------------------------------------------------------
+        # Score via the active grader
+        # ------------------------------------------------------------------
+        score = self.grader.grade_action(action, self.gt)
+        bonus = self.grader.compute_step_bonus(action, self.step_count, self.history)
+        score = max(0.01, min(score + bonus, 0.99))
         done = (
+            action.action_type == "final_decision"
+            or self.step_count >= self.max_steps
         )
         if done:
+            score = self.grader.compute_done_score(self.history, self.gt)
+        # ------------------------------------------------------------------
+        # Build response
+        # ------------------------------------------------------------------
         obs = CodeReviewObservation(
             pr=self.pr,
             previous_comments=[a.comment for a in self.history if a.comment],
             step_count=self.step_count,
             max_steps=self.max_steps,
         )
         rew = CodeReviewReward(score=score, feedback="graded")
+        # print(f"[{self.grader_level.upper()}] Step {self.step_count} — Score: {rew.score:.4f}")
         return CodeReviewStepResponse(
             observation=obs,
             reward=rew.score,
             done=done,
             info={
+                "grader_level":      self.grader_level,
+                "task_type":         self.task_type,
                 "issues_identified": len(self.issues_identified),
+                "fix_attempted":     self.fix_attempted,
             },
         )
     @property
     def state(self) -> State:
         return self._state
+    def _invalid_step(self) -> CodeReviewStepResponse:
         rew = CodeReviewReward(score=0.0, feedback="invalid action")
         obs = CodeReviewObservation(
             pr=self.pr,
             previous_comments=[a.comment for a in self.history if a.comment],
             step_count=self.step_count,
             reward=rew,
             done=True,
             info={"error": "invalid_action"},
+        )

server/graders.py ADDED Viewed

	@@ -0,0 +1,353 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Graders for the Code Review Environment.
+Three difficulty tiers:
+  - EasyGrader   : Forgiving. Substring matching, partial credit for wrong decisions.
+  - MediumGrader : Balanced. Token overlap, line-level fix matching, recency weighting.
+  - HardGrader   : Strict.   No wrong-decision credit, final-step-only done scoring.
+"""
+import re
+from difflib import SequenceMatcher
# Filler words that are dropped before token-overlap comparisons so that
# they do not inflate similarity between a submission and the ground truth.
STOP_WORDS = {
    "a",
    "an",
    "and",
    "at",
    "by",
    "for",
    "from",
    "in",
    "is",
    "it",
    "of",
    "on",
    "or",
    "that",
    "the",
    "to",
    "use",
    "with",
}
+def _normalize(text: str) -> str:
+    return (text or "").lower().strip()
def _code_tokens(text: str) -> list[str]:
    """Tokenise *text* into identifiers, integer runs and operator runs.

    Tokens whose lower-cased form appears in ``STOP_WORDS`` are discarded.
    Order and duplicates of the remaining tokens are preserved.
    """
    kept = []
    for token in re.findall(r"[a-zA-Z_]\w*|\d+|[=<>!+\-*/]+", text):
        if token.lower() in STOP_WORDS:
            continue
        kept.append(token)
    return kept
+# ==============================================================================
+# Base Grader
+# ==============================================================================
class BaseGrader:
    """
    Common scoring skeleton for all grader tiers.

    ``grade_action`` combines three component scores using the class-level
    weights below. The component scorers and the bonus / end-of-episode
    hooks are placeholders that raise ``NotImplementedError`` here; each
    difficulty tier supplies its own versions.
    """

    # Component weights; subclasses override these (intended to sum to 1.0).
    ISSUE_WEIGHT: float = 0.40
    FIX_WEIGHT: float = 0.30
    DECISION_WEIGHT: float = 0.30

    def grade_action(self, action, ground_truth: dict) -> float:
        """Return the weighted score of one action, clamped to [0.01, 0.99].

        Reads ``action.comment`` and ``action.suggested_code`` and passes the
        whole action to ``score_decision``.
        """
        issue_part = self.ISSUE_WEIGHT * self.score_issues(action.comment, ground_truth)
        fix_part = self.FIX_WEIGHT * self.score_fix(action.suggested_code, ground_truth)
        decision_part = self.DECISION_WEIGHT * self.score_decision(action, ground_truth)
        total = issue_part + fix_part + decision_part
        capped = min(total, 0.99)
        return max(0.01, capped)

    def score_issues(self, comment: str, ground_truth: dict) -> float:
        """Score how well *comment* covers the expected issues (tier-specific)."""
        raise NotImplementedError

    def score_fix(self, suggested_code: str, ground_truth: dict) -> float:
        """Score *suggested_code* against the expected fix (tier-specific)."""
        raise NotImplementedError

    def score_decision(self, action, ground_truth: dict) -> float:
        """Score the action's decision against the expected one (tier-specific)."""
        raise NotImplementedError

    def compute_step_bonus(self, action, step_count: int, history: list) -> float:
        """Return the per-step bonus or penalty (tier-specific)."""
        raise NotImplementedError

    def compute_done_score(self, history: list, ground_truth: dict) -> float:
        """Return the end-of-episode score for *history* (tier-specific)."""
        raise NotImplementedError
+# ==============================================================================
+# Easy Grader
+# ==============================================================================
class EasyGrader(BaseGrader):
    """
    Lenient grader. Best for round-1 filtering / warm-up tasks.

    - Issue detection : simple substring match
    - Fix quality     : token overlap + sequence similarity
    - Wrong decision  : 0.2 partial credit
    - Done scoring    : max over entire history (most forgiving)
    - Bonuses         : generous, long trajectories are acceptable

    Weights: issues=40%, fix=30%, decision=30%
    """

    ISSUE_WEIGHT = 0.40
    FIX_WEIGHT = 0.30
    DECISION_WEIGHT = 0.30

    def score_issues(self, comment: str, ground_truth: dict) -> float:
        """Return the fraction of expected issues mentioned verbatim in *comment*.

        Matching is a case-insensitive substring test. Returns 0.0 when the
        comment is empty or the ground truth lists no issues.
        """
        issues = ground_truth.get("issues", [])
        if not comment or not issues:
            return 0.0
        comment_norm = _normalize(comment)
        matches = 0
        for issue in issues:
            issue_norm = _normalize(issue)
            # An empty string is a substring of every string, so a blank or
            # missing ground-truth entry must not be credited as "found".
            if issue_norm and issue_norm in comment_norm:
                matches += 1
        # Blank entries still count in the denominator, matching how the
        # other tiers treat issues they cannot tokenise.
        return matches / len(issues)

    def score_fix(self, suggested_code: str, ground_truth: dict) -> float:
        """Score *suggested_code* against ``ground_truth["fix"]`` in [0, 1].

        A normalised substring hit scores 1.0. Otherwise the score is
        70% token overlap plus 30% character-sequence similarity, rounded
        to four decimals. Missing suggestion or missing expected fix -> 0.0.
        """
        if not suggested_code:
            return 0.0
        expected = _normalize(ground_truth.get("fix", ""))
        suggested = _normalize(suggested_code)
        if not expected:
            return 0.0
        if expected in suggested:
            return 1.0
        exp_tok = _code_tokens(expected)
        sug_tok = set(_code_tokens(suggested))
        token_score = (
            sum(1 for t in exp_tok if t in sug_tok) / len(exp_tok) if exp_tok else 0.0
        )
        seq_score = SequenceMatcher(None, expected, suggested).ratio()
        return round(0.7 * token_score + 0.3 * seq_score, 4)

    def score_decision(self, action, ground_truth: dict) -> float:
        """Return 1.0 for the expected decision, 0.2 for a wrong one, else 0.0.

        Only ``final_decision`` actions that carry a decision are scored.
        """
        if action.action_type != "final_decision" or not action.decision:
            return 0.0
        if action.decision == ground_truth.get("decision"):
            return 1.0
        return 0.2  # generous partial credit for wrong decision

    def compute_step_bonus(self, action, step_count: int, history: list) -> float:
        """Return the per-step bonus; *history* is accepted but not used here."""
        bonus = 0.0
        # Reward comments longer than 30 characters.
        if action.comment and len(action.comment) > 30:
            bonus += 0.15
        # Reward reaching a final decision within the first three steps.
        if action.action_type == "final_decision" and step_count <= 3:
            bonus += 0.10
        # Small penalty for a non-final step that carries no comment.
        if not action.comment and action.action_type != "final_decision":
            bonus -= 0.05
        return bonus

    def compute_done_score(self, history: list, ground_truth: dict) -> float:
        """Most forgiving: best single action across all of history."""
        # An empty history falls back to 0.0, which the clamp lifts to 0.01.
        scores = [self.grade_action(a, ground_truth) for a in history] or [0.0]
        return max(0.01, min(max(scores), 0.99))
+# ==============================================================================
+# Medium Grader
+# ==============================================================================
class MediumGrader(BaseGrader):
    """
    Balanced grader. Suitable for main competition rounds.

    - Issue detection : token overlap + substring fallback
    - Fix quality     : token overlap + line-level + sequence similarity
    - Wrong decision  : 0.1 partial credit
    - Done scoring    : recency-weighted (recent actions matter more)
    - Bonuses         : moderate, efficiency is rewarded

    Weights: issues=42%, fix=30%, decision=28%
    """

    ISSUE_WEIGHT = 0.42
    FIX_WEIGHT = 0.30
    DECISION_WEIGHT = 0.28

    def score_issues(self, comment: str, ground_truth: dict) -> float:
        """Return the mean per-issue match score of *comment*, in [0, 1].

        Each issue scores the larger of its word-token overlap with the
        comment and a 0/1 substring hit. Issues that yield no word tokens
        contribute nothing but still count in the denominator.
        """
        issues = ground_truth.get("issues", [])
        if not comment or not issues:
            return 0.0
        comment_text = _normalize(comment)
        comment_tokens = set(re.findall(r"[a-zA-Z_]\w*", comment_text)) - STOP_WORDS
        best_scores = []
        for issue in issues:
            issue_text = _normalize(issue)
            issue_tokens = set(re.findall(r"[a-zA-Z_]\w*", issue_text)) - STOP_WORDS
            if not issue_tokens:
                continue
            overlap = len(issue_tokens & comment_tokens) / len(issue_tokens)
            substring = 1.0 if issue_text in comment_text else 0.0
            best_scores.append(max(overlap, substring))
        return round(sum(best_scores) / len(issues), 4) if best_scores else 0.0

    def score_fix(self, suggested_code: str, ground_truth: dict) -> float:
        """Score *suggested_code* against ``ground_truth["fix"]`` in [0, 1].

        A normalised substring hit scores 1.0. Otherwise the score blends
        token overlap (40%), character-sequence similarity (30%) and the
        share of expected lines present in the suggestion (30%), rounded to
        four decimals. Missing suggestion or missing expected fix -> 0.0.
        """
        if not suggested_code:
            return 0.0
        expected = _normalize(ground_truth.get("fix", ""))
        suggested = _normalize(suggested_code)
        if not expected:
            return 0.0
        if expected in suggested:
            return 1.0
        exp_lines = [line.strip() for line in expected.splitlines() if line.strip()]
        # A set gives constant-time membership tests; only presence matters.
        sug_lines = {line.strip() for line in suggested.splitlines() if line.strip()}
        line_score = (
            sum(1 for line in exp_lines if line in sug_lines) / len(exp_lines)
            if exp_lines else 0.0
        )
        exp_tok = _code_tokens(expected)
        sug_tok = set(_code_tokens(suggested))
        token_score = (
            sum(1 for t in exp_tok if t in sug_tok) / len(exp_tok) if exp_tok else 0.0
        )
        seq_score = SequenceMatcher(None, expected, suggested).ratio()
        return round(0.4 * token_score + 0.3 * seq_score + 0.3 * line_score, 4)

    def score_decision(self, action, ground_truth: dict) -> float:
        """Return 1.0 for the expected decision, 0.1 for a wrong one, else 0.0.

        Only ``final_decision`` actions that carry a decision are scored.
        """
        if action.action_type != "final_decision" or not action.decision:
            return 0.0
        if action.decision == ground_truth.get("decision"):
            return 1.0
        return 0.1  # reduced partial credit

    def compute_step_bonus(self, action, step_count: int, history: list) -> float:
        """Return the per-step bonus; *history* is accepted but not used here."""
        bonus = 0.0
        # Reward comments longer than 40 characters.
        if action.comment and len(action.comment) > 40:
            bonus += 0.10
        # Reward early final decisions: more at step 1, less at step 2.
        if action.action_type == "final_decision":
            if step_count == 1:
                bonus += 0.10
            elif step_count == 2:
                bonus += 0.05
        # Flat penalty once the trajectory runs past three steps.
        if step_count > 3:
            bonus -= 0.04
        # Penalty for a non-final step that carries no comment.
        if not action.comment and action.action_type != "final_decision":
            bonus -= 0.08
        return bonus

    def compute_done_score(self, history: list, ground_truth: dict) -> float:
        """Recency-weighted: later actions in history count for more.

        Returns the best weighted action score, clamped to [0.01, 0.99].
        An empty history returns the 0.01 floor.
        """
        if not history:
            # max() on an empty list raises ValueError; return the floor,
            # as the other tiers do for an empty episode.
            return 0.01
        n = len(history)
        # Weights run from 0.6 for the first action to 0.6 + 0.4 * (n - 1) / n
        # for the last, so even the final action is scaled below 1.0.
        # NOTE(review): confirm whether the last action was meant to reach
        # full weight, i.e. (i + 1) / n.
        weighted = [
            self.grade_action(a, ground_truth) * (0.6 + 0.4 * (i / n))
            for i, a in enumerate(history)
        ]
        return max(0.01, min(max(weighted), 0.99))
+# ==============================================================================
+# Hard Grader
+# ==============================================================================
class HardGrader(BaseGrader):
    """
    Strict grader. For finals / advanced rounds.

    - Issue detection : token overlap + seq similarity with a minimum threshold
    - Fix quality     : line-level match dominant, no free token credit
    - Wrong decision  : 0.0 (no credit at all)
    - Done scoring    : final step only (harshest)
    - Bonuses         : minimal, escalating penalty for long trajectories

    Weights: issues=45%, fix=28%, decision=27%
    """

    ISSUE_WEIGHT = 0.45
    FIX_WEIGHT = 0.28
    DECISION_WEIGHT = 0.27

    # Combined per-issue score below this value is zeroed out.
    ISSUE_THRESHOLD = 0.30

    def score_issues(self, comment: str, ground_truth: dict) -> float:
        """Return the mean thresholded per-issue score of *comment*.

        Each issue scores 70% word-token overlap plus 30% character-sequence
        similarity with the comment; anything under ``ISSUE_THRESHOLD`` is
        replaced by 0.0. Issues with no word tokens are skipped but still
        count in the denominator.
        """
        expected_issues = ground_truth.get("issues", [])
        if not comment or not expected_issues:
            return 0.0

        normalized_comment = _normalize(comment)
        comment_words = set(re.findall(r"[a-zA-Z_]\w*", normalized_comment)) - STOP_WORDS

        per_issue = []
        for raw_issue in expected_issues:
            normalized_issue = _normalize(raw_issue)
            issue_words = set(re.findall(r"[a-zA-Z_]\w*", normalized_issue)) - STOP_WORDS
            if not issue_words:
                continue
            shared_ratio = len(issue_words & comment_words) / len(issue_words)
            char_ratio = SequenceMatcher(None, normalized_issue, normalized_comment).ratio()
            blended = 0.7 * shared_ratio + 0.3 * char_ratio
            # Vague hints that stay under the threshold earn nothing.
            if blended >= self.ISSUE_THRESHOLD:
                per_issue.append(blended)
            else:
                per_issue.append(0.0)

        if not per_issue:
            return 0.0
        return round(sum(per_issue) / len(expected_issues), 4)

    def score_fix(self, suggested_code: str, ground_truth: dict) -> float:
        """Score *suggested_code* against ``ground_truth["fix"]`` in [0, 1].

        A normalised substring hit scores 1.0. Otherwise the score blends
        exact line matches (50%), token overlap (30%) and character-sequence
        similarity (20%), rounded to four decimals.
        """
        if not suggested_code:
            return 0.0
        target = _normalize(ground_truth.get("fix", ""))
        candidate = _normalize(suggested_code)
        if not target:
            return 0.0
        if target in candidate:
            return 1.0

        target_lines = [row.strip() for row in target.splitlines() if row.strip()]
        candidate_lines = {row.strip() for row in candidate.splitlines() if row.strip()}
        if target_lines:
            matched_lines = len([row for row in target_lines if row in candidate_lines])
            line_score = matched_lines / len(target_lines)
        else:
            line_score = 0.0

        target_tokens = _code_tokens(target)
        candidate_tokens = set(_code_tokens(candidate))
        if target_tokens:
            matched_tokens = len([tok for tok in target_tokens if tok in candidate_tokens])
            token_score = matched_tokens / len(target_tokens)
        else:
            token_score = 0.0

        seq_score = SequenceMatcher(None, target, candidate).ratio()
        # Exact line matches carry the most weight in this tier.
        return round(0.5 * line_score + 0.3 * token_score + 0.2 * seq_score, 4)

    def score_decision(self, action, ground_truth: dict) -> float:
        """Return 1.0 only for a ``final_decision`` matching the expected one."""
        if action.action_type != "final_decision" or not action.decision:
            return 0.0
        if action.decision == ground_truth.get("decision"):
            return 1.0
        return 0.0

    def compute_step_bonus(self, action, step_count: int, history: list) -> float:
        """Return the per-step bonus; *history* is accepted but not used here."""
        is_final = action.action_type == "final_decision"
        bonus = 0.0
        if is_final and step_count == 1:
            # Only a first-step final decision is rewarded.
            bonus += 0.05
        if step_count > 2:
            # The penalty grows by 0.05 for every step beyond the second.
            bonus -= 0.05 * (step_count - 2)
        if not action.comment and not is_final:
            bonus -= 0.12
        return bonus

    def compute_done_score(self, history: list, ground_truth: dict) -> float:
        """Strictest: only the final action in the episode counts."""
        if not history:
            return 0.01
        last_score = self.grade_action(history[-1], ground_truth)
        return max(0.01, min(last_score, 0.99))
+# ==============================================================================
+# Factory
+# ==============================================================================
# Maps each supported difficulty name to the grader class implementing it.
GRADER_REGISTRY: dict[str, type[BaseGrader]] = {
    "easy": EasyGrader,
    "medium": MediumGrader,
    "hard": HardGrader,
}


def get_grader(level: str = "medium") -> BaseGrader:
    """
    Return a grader instance for the given difficulty level.

    The lookup is case-insensitive and ignores surrounding whitespace.

    Args:
        level: One of "easy", "medium", or "hard".

    Returns:
        An instantiated grader.

    Raises:
        ValueError: If the level is not recognised, including when it is
            not a string at all.
    """
    if not isinstance(level, str):
        # Without this check, .lower() would raise AttributeError for values
        # such as None instead of the documented ValueError.
        raise ValueError(
            f"Unknown grader level '{level}'. Choose from: {list(GRADER_REGISTRY)}"
        )
    level = level.strip().lower()
    grader_cls = GRADER_REGISTRY.get(level)
    if grader_cls is None:
        raise ValueError(
            f"Unknown grader level '{level}'. Choose from: {list(GRADER_REGISTRY)}"
        )
    return grader_cls()