Spaces:

SimranShaikh
/

code-review-env

Sleeping

App Files Files Community

SimranShaikh committed 7 days ago

Commit

bdbb08b

verified ·

1 Parent(s): 220b4a7

commit

Browse files

Files changed (1) hide show

environment/graders.py +259 -0

environment/graders.py ADDED Viewed

	@@ -0,0 +1,259 @@

+"""
+Deterministic graders for each task.
+All graders return a score in [0.0, 1.0] and a feedback message.
+"""
+from typing import List, Tuple
+from environment.models import CodeReviewAction, Issue
+# ─────────────────────────────────────────────────────────────
+# Helpers
+# ─────────────────────────────────────────────────────────────
def _keyword_hit(text: str, keywords: List[str]) -> bool:
    """Return True when at least one keyword occurs in `text`, ignoring case."""
    haystack = text.lower()
    for keyword in keywords:
        if keyword.lower() in haystack:
            return True
    return False
def _keyword_score(text: str, keywords: List[str]) -> float:
    """Return the fraction of `keywords` that occur in `text`, ignoring case.

    Args:
        text: Text to search.
        keywords: Substrings to look for. Every entry counts once towards the
            denominator, duplicates included.

    Returns:
        A value in [0.0, 1.0]; 0.0 when `keywords` is empty.
    """
    if not keywords:
        return 0.0
    # Lower-case the text once rather than once per keyword.
    haystack = text.lower()
    hits = sum(1 for kw in keywords if kw.lower() in haystack)
    return hits / len(keywords)
def _issue_text(issues: List[Issue]) -> str:
    """Flatten each issue's type, description and line number into one lower-cased string."""
    fragments = []
    for item in issues:
        fragments.extend((item.issue_type, item.description))
        # Line numbers are optional; only present ones are included.
        if item.line_number is not None:
            fragments.append(str(item.line_number))
    return " ".join(fragments).lower()
+# ─────────────────────────────────────────────────────────────
+# Easy grader: syntax error detection
+# ─────────────────────────────────────────────────────────────
def grade_easy(action: CodeReviewAction, ground_truth: dict) -> Tuple[float, str]:
    """
    Score a syntax-error review against `ground_truth`.

    Points (total capped at 1.0):
      0.35       — an identified issue has issue_type "syntax_error" (any case)
      up to 0.35 — share of ground_truth["keywords"] found in the issue text
                   plus explanation; full marks once half of them match
      0.30       — suggested_fix contains any of ground_truth["fix_keywords"]

    Returns the score rounded to 4 places and a newline-joined feedback report.
    """
    notes = []
    reported_types = [issue.issue_type.lower() for issue in action.identified_issues]
    searchable = _issue_text(action.identified_issues) + " " + (action.explanation or "")

    # 1) Issue type
    if "syntax_error" in reported_types:
        type_points = 0.35
        notes.append("✅ Correctly identified as a syntax error (+0.35)")
    else:
        type_points = 0.0
        notes.append(
            f"❌ Expected issue_type='syntax_error', got {reported_types} (+0.00)"
        )

    # 2) Keyword coverage of the description
    coverage = _keyword_score(searchable, ground_truth["keywords"])
    desc_points = round(0.35 * min(coverage * 2, 1.0), 3)
    desc_mark = "✅" if desc_points > 0.1 else "❌"
    notes.append(
        f"{desc_mark} Description accuracy: "
        f"{desc_points:.2f}/0.35 (keyword match {coverage:.0%})"
    )

    # 3) Fix quality
    proposed_fix = action.suggested_fix or ""
    if _keyword_hit(proposed_fix, ground_truth["fix_keywords"]):
        fix_points = 0.30
        notes.append("✅ Suggested fix contains the correct patch (+0.30)")
    else:
        fix_points = 0.0
        notes.append("❌ Suggested fix missing or incorrect (+0.00)")

    total = round(min(type_points + desc_points + fix_points, 1.0), 4)
    return total, "\n".join(notes)
+# ─────────────────────────────────────────────────────────────
+# Medium grader: logic bug detection
+# ─────────────────────────────────────────────────────────────
def _run_is_palindrome(code: str) -> List[Tuple[str, bool, bool]]:
    """
    Run the `is_palindrome` defined by `code` against a fixed set of test cases.

    The code is executed in a separate Python interpreter with a 5-second
    timeout. NOTE: this is process isolation only, not a security sandbox —
    the submitted code runs with the grader's own privileges, so untrusted
    submissions need an external sandbox.

    Args:
        code: Python source expected to define `is_palindrome(s)`. It may be
            uniformly indented and may write to stdout.

    Returns:
        A list of (input, expected, actual) tuples, one per test case, where
        `actual` is None if the call raised. An empty list is returned when
        the code could not be run (non-zero exit status, timeout, or output
        that cannot be parsed).
    """
    import json
    import subprocess
    import sys
    import textwrap

    harness = textwrap.dedent(
        """\
        import json
        cases = [
            ("racecar", True),
            ("hello", False),
            ("amanaplanacanalpanama", True),
            ("abba", True),
            ("abc", False),
        ]
        results = []
        for inp, exp in cases:
            try:
                got = is_palindrome(inp)
                results.append([inp, exp, bool(got)])
            except Exception:
                results.append([inp, exp, None])
        # Leading newline: the JSON always starts on a fresh line even if the
        # submission left the cursor mid-line.
        print("\\n" + json.dumps(results))
        """
    )
    # Dedent the submission on its own; dedenting it together with the
    # column-0 harness would be a no-op and an indented fix would not parse.
    test_driver = textwrap.dedent(code) + "\n" + harness

    try:
        out = subprocess.run(
            [sys.executable, "-c", test_driver],
            capture_output=True, text=True, timeout=5
        )
        if out.returncode != 0:
            return []
        # The submission may print as well; the harness result is the last line.
        lines = out.stdout.strip().splitlines()
        if not lines:
            return []
        return [tuple(row) for row in json.loads(lines[-1])]
    except (subprocess.SubprocessError, OSError, ValueError, TypeError):
        # Timeout, launch failure or unparsable output: report "not runnable".
        return []
def grade_medium(action: CodeReviewAction, ground_truth: dict) -> Tuple[float, str]:
    """
    Score a logic-bug review against `ground_truth`.

    Points (total capped at 1.0):
      0.25       — an identified issue has issue_type "logic_bug" (any case)
      up to 0.25 — share of ground_truth["keywords"] found in the issue text
                   plus explanation; full marks once 40% of them match
      up to 0.50 — fraction of `_run_is_palindrome` test cases that the
                   suggested fix passes

    Returns the score rounded to 4 places and a newline-joined feedback report.
    """
    total = 0.0
    notes = []
    reported_types = [issue.issue_type.lower() for issue in action.identified_issues]
    searchable = _issue_text(action.identified_issues) + " " + (action.explanation or "")

    # 1) Issue type
    if "logic_bug" not in reported_types:
        notes.append(f"❌ Expected 'logic_bug', got {reported_types} (+0.00)")
    else:
        total += 0.25
        notes.append("✅ Correctly identified as a logic bug (+0.25)")

    # 2) Keyword coverage of the description
    coverage = _keyword_score(searchable, ground_truth["keywords"])
    desc_pts = round(0.25 * min(coverage * 2.5, 1.0), 3)
    total += desc_pts
    desc_mark = "✅" if desc_pts > 0.08 else "❌"
    notes.append(
        f"{desc_mark} Description accuracy: "
        f"{desc_pts:.2f}/0.25 (keyword match {coverage:.0%})"
    )

    # 3) Execute the suggested fix against the test cases
    candidate = action.suggested_fix or ""
    if not candidate.strip():
        notes.append("❌ No suggested fix provided (+0.00)")
    else:
        outcomes = _run_is_palindrome(candidate)
        if not outcomes:
            notes.append("❌ Fix code could not be executed (+0.00)")
        else:
            case_count = len(outcomes)
            passed = sum(1 for _, exp, got in outcomes if got == exp)
            pts = round(0.50 * (passed / case_count), 3)
            total += pts
            summary_mark = "✅" if passed == case_count else "⚠️"
            notes.append(
                f"{summary_mark} Fix passed "
                f"{passed}/{case_count} test cases → +{pts:.2f}/0.50"
            )
            for inp, exp, got in outcomes:
                status = "✅" if got == exp else "❌"
                notes.append(f"    {status} is_palindrome({inp!r}) → {got} (expected {exp})")

    total = round(min(total, 1.0), 4)
    return total, "\n".join(notes)
+# ─────────────────────────────────────────────────────────────
+# Hard grader: security vulnerability detection
+# ─────────────────────────────────────────────────────────────
def grade_hard(action: CodeReviewAction, ground_truth: dict) -> Tuple[float, str]:
    """
    Score a security review against ground_truth["vulnerabilities"].

    Every vulnerability carries an equal share (1 / N) of the score, split as:
      45% — the review contains at least one issue whose issue_type mentions
            "security" (a review-wide check, not tied to a specific vulnerability)
      30% — scaled by the fraction of vuln["keywords"] found in the issue text
            plus explanation; full marks once a third of them match
      25% — suggested_fix or explanation contains any of vuln["fix_keywords"]
    With three vulnerabilities that is 0.15 / 0.10 / ~0.083 each.

    A 0.05 bonus is added when every vulnerability's keywords were matched.
    The total is capped at 1.0.

    Returns:
        (score, feedback). An empty vulnerability list yields a score of 0.0.

    Raises:
        KeyError: if `ground_truth` or a vulnerability entry lacks a required key.
    """
    vulns = ground_truth["vulnerabilities"]
    if not vulns:
        # Nothing to grade against; avoids dividing by zero below.
        return 0.0, "❌ No vulnerabilities defined in ground truth (+0.00)"

    per_vuln = 1.0 / len(vulns)
    all_issue_text = _issue_text(action.identified_issues) + " " + (action.explanation or "")
    fix_text = (action.suggested_fix or "") + " " + (action.explanation or "")
    # Same answer for every vulnerability, so compute it once up front.
    flagged_security = any(
        "security" in i.issue_type.lower() for i in action.identified_issues
    )

    total_score = 0.0
    feedback_parts = []
    found_count = 0
    for vuln in vulns:
        v_score = 0.0
        v_name = vuln["name"]
        feedback_parts.append(f"\n🔍 Checking: {v_name}")

        # a) issue_type mentions "security"
        if flagged_security:
            v_score += per_vuln * 0.45
            feedback_parts.append(f"  ✅ Flagged as security vulnerability (+{per_vuln*0.45:.3f})")
        else:
            feedback_parts.append("  ❌ Not flagged as security vulnerability (+0.00)")

        # b) description keyword match
        if _keyword_hit(all_issue_text, vuln["keywords"]):
            kw_score_val = _keyword_score(all_issue_text, vuln["keywords"])
            pts = round(per_vuln * 0.30 * min(kw_score_val * 3, 1.0), 4)
            v_score += pts
            feedback_parts.append(f"  ✅ Identified '{v_name}' in description (+{pts:.3f})")
            found_count += 1
        else:
            feedback_parts.append(f"  ❌ '{v_name}' not mentioned in description (+0.00)")

        # c) fix keyword match
        if _keyword_hit(fix_text, vuln["fix_keywords"]):
            v_score += per_vuln * 0.25
            feedback_parts.append(f"  ✅ Fix addresses '{v_name}' (+{per_vuln*0.25:.3f})")
        else:
            feedback_parts.append(f"  ❌ Fix doesn't address '{v_name}' (+0.00)")

        total_score += v_score

    # Bonus: every vulnerability was identified in the description.
    if found_count == len(vulns):
        total_score = min(total_score + 0.05, 1.0)
        feedback_parts.append(
            f"\n🎯 Bonus: All {len(vulns)} vulnerabilities identified! (+0.05)"
        )

    total_score = round(min(total_score, 1.0), 4)
    return total_score, "\n".join(feedback_parts)
+# ─────────────────────────────────────────────────────────────
+# Dispatcher
+# ─────────────────────────────────────────────────────────────
def grade(task_id: str, action: CodeReviewAction, ground_truth: dict) -> Tuple[float, str]:
    """Route `action` to the grader registered for `task_id`.

    Returns the selected grader's (score, feedback) pair.

    Raises:
        ValueError: if `task_id` is not one of the known task ids.
    """
    if task_id == "easy_syntax":
        return grade_easy(action, ground_truth)
    if task_id == "medium_logic":
        return grade_medium(action, ground_truth)
    if task_id == "hard_security":
        return grade_hard(action, ground_truth)
    raise ValueError(f"No grader for task: {task_id}")