# WhyDidItFail / server/llm_judge.py
# (last commit 6b279f6 by samrat-rm: "chore: update doc string")
"""
LLM Judge β€” reasoning quality scorer for WhyDidItFail.
Called from inference.py after submit_diagnosis.
Uses the same OpenAI-compatible client and model as the agent.
Returns a normalized score in [0.10, 0.90] representing reasoning quality.
Returns None silently if reasoning is absent or the call fails.
Scoring criteria (0–5 each, total 0–15 β†’ normalized to 0.10–0.90):
evidence_grounding β€” does the reasoning cite specific observed values?
causal_chain β€” does it connect evidence to the failure mode logically?
fix_rationale β€” is the fix justified by the evidence?
Final score in inference.py:
total = 0.85 * keyword_score + 0.15 * judge_score β†’ always in [0.10, 0.90]
"""
import json
from openai import OpenAI
def _build_prompt(
diagnosis: str,
suggested_fix: str | None,
reasoning: str,
scenario: dict,
inspection_order: list[str],
) -> str:
seen: dict = {}
if "logs" in inspection_order:
seen["training_logs"] = scenario.get("logs", [])
if "config" in inspection_order:
seen["config"] = scenario.get("config", {})
if "gradients" in inspection_order:
seen["gradient_norms"] = scenario.get("gradient_norms", None)
return f"""You are evaluating the reasoning of an ML debugging agent.
Agent submission:
Diagnosis: {diagnosis}
Suggested fix: {suggested_fix or "none provided"}
Reasoning: {reasoning}
Data the agent had access to:
{json.dumps(seen, indent=2)}
Score the reasoning (integers only):
evidence_grounding (0-5): Does the reasoning cite specific values from the data above?
causal_chain (0-5): Does it logically connect that evidence to the diagnosed failure mode?
fix_rationale (0-5): Is the fix directly justified by the evidence and diagnosis?
Respond with JSON only, no explanation:
{{"evidence_grounding": <int>, "causal_chain": <int>, "fix_rationale": <int>}}"""
def judge(
client: OpenAI,
model: str,
diagnosis: str,
reasoning: str | None,
suggested_fix: str | None,
scenario: dict,
inspection_order: list[str],
) -> float | None:
"""Score reasoning quality. Returns 0.10–0.90, or None if unavailable/failed.
None signals the caller to skip judge weighting entirely and use the
keyword score at full weight (1.0) rather than 0.85.
"""
if not reasoning or not reasoning.strip():
return None
try:
completion = client.chat.completions.create(
model=model,
messages=[
{"role": "user", "content": _build_prompt(
diagnosis, suggested_fix, reasoning, scenario, inspection_order
)},
],
temperature=0.0,
max_tokens=64,
)
text = (completion.choices[0].message.content or "").strip()
data = json.loads(text)
raw = (
data.get("evidence_grounding", 0)
+ data.get("causal_chain", 0)
+ data.get("fix_rationale", 0)
)
# normalize: raw 0–15 β†’ 0.10–0.90 (never below 0.10 or above 0.90)
return round(max(0.10, min(0.90, raw / 15)), 2)
except Exception as exc:
print(f" [JUDGE] failed: {exc}", flush=True)
return None