Spaces:
Sleeping
Sleeping
| """ | |
| LLM Judge β reasoning quality scorer for WhyDidItFail. | |
| Called from inference.py after submit_diagnosis. | |
| Uses the same OpenAI-compatible client and model as the agent. | |
| Returns a normalized score in [0.10, 0.90] representing reasoning quality. | |
| Returns None silently if reasoning is absent or the call fails. | |
| Scoring criteria (0β5 each, total 0β15 β normalized to 0.10β0.90): | |
| evidence_grounding β does the reasoning cite specific observed values? | |
| causal_chain β does it connect evidence to the failure mode logically? | |
| fix_rationale β is the fix justified by the evidence? | |
| Final score in inference.py: | |
| total = 0.85 * keyword_score + 0.15 * judge_score β always in [0.10, 0.90] | |
| """ | |
| import json | |
| from openai import OpenAI | |
| def _build_prompt( | |
| diagnosis: str, | |
| suggested_fix: str | None, | |
| reasoning: str, | |
| scenario: dict, | |
| inspection_order: list[str], | |
| ) -> str: | |
| seen: dict = {} | |
| if "logs" in inspection_order: | |
| seen["training_logs"] = scenario.get("logs", []) | |
| if "config" in inspection_order: | |
| seen["config"] = scenario.get("config", {}) | |
| if "gradients" in inspection_order: | |
| seen["gradient_norms"] = scenario.get("gradient_norms", None) | |
| return f"""You are evaluating the reasoning of an ML debugging agent. | |
| Agent submission: | |
| Diagnosis: {diagnosis} | |
| Suggested fix: {suggested_fix or "none provided"} | |
| Reasoning: {reasoning} | |
| Data the agent had access to: | |
| {json.dumps(seen, indent=2)} | |
| Score the reasoning (integers only): | |
| evidence_grounding (0-5): Does the reasoning cite specific values from the data above? | |
| causal_chain (0-5): Does it logically connect that evidence to the diagnosed failure mode? | |
| fix_rationale (0-5): Is the fix directly justified by the evidence and diagnosis? | |
| Respond with JSON only, no explanation: | |
| {{"evidence_grounding": <int>, "causal_chain": <int>, "fix_rationale": <int>}}""" | |
def judge(
    client: OpenAI,
    model: str,
    diagnosis: str,
    reasoning: str | None,
    suggested_fix: str | None,
    scenario: dict,
    inspection_order: list[str],
) -> float | None:
    """Score reasoning quality. Returns 0.10-0.90, or None if unavailable/failed.

    None signals the caller to skip judge weighting entirely and use the
    keyword score at full weight (1.0) rather than 0.85.

    Args:
        client: OpenAI-compatible client (same one the agent uses).
        model: Model name passed straight through to the chat API.
        diagnosis: The agent's diagnosed failure mode.
        reasoning: The agent's free-text reasoning; None/blank skips judging.
        suggested_fix: Optional fix text included in the judge prompt.
        scenario: Raw scenario dict (logs / config / gradient_norms).
        inspection_order: Data sources the agent actually inspected.
    """
    if not reasoning or not reasoning.strip():
        return None
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": _build_prompt(
                    diagnosis, suggested_fix, reasoning, scenario, inspection_order
                )},
            ],
            temperature=0.0,
            max_tokens=64,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Models frequently wrap their reply in markdown fences despite the
        # "JSON only" instruction; extract the outermost {...} span first
        # instead of failing the whole judge call.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise ValueError(f"no JSON object in judge reply: {text!r}")
        data = json.loads(text[start:end + 1])
        # Clamp each criterion to the advertised 0-5 range so a misbehaving
        # judge cannot push the total outside 0-15; int() tolerates floats or
        # numeric strings, and anything non-numeric falls through to except.
        raw = sum(
            max(0, min(5, int(data.get(key, 0))))
            for key in ("evidence_grounding", "causal_chain", "fix_rationale")
        )
        # normalize: raw 0-15 -> clamped into [0.10, 0.90]
        return round(max(0.10, min(0.90, raw / 15)), 2)
    except Exception as exc:
        # Best-effort by design: any failure (API, parsing, bad types)
        # degrades to None rather than breaking the caller's scoring.
        print(f" [JUDGE] failed: {exc}", flush=True)
        return None