Jayant-Kernel committed on
Commit ·
66bdd16
1
Parent(s): 3d9195a
fix: parse_action confidence bug, numeric answers bug, missing reasoning field bug
Browse files- evaluate.py +17 -16
evaluate.py
CHANGED
|
@@ -61,28 +61,29 @@ SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON
|
|
| 61 |
import re
|
| 62 |
|
| 63 |
def parse_action(text):
|
| 64 |
-
|
|
|
|
| 65 |
try:
|
| 66 |
-
obj = json.loads(
|
| 67 |
-
if isinstance(obj, dict) and "reasoning" in obj:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
return {
|
| 69 |
"reasoning": str(obj.get("reasoning", "")),
|
| 70 |
-
"answer":
|
| 71 |
-
"confidence":
|
| 72 |
"abstain": bool(obj.get("abstain", False)),
|
| 73 |
"is_final": bool(obj.get("is_final", True)),
|
| 74 |
}
|
| 75 |
-
except:
|
| 76 |
-
|
| 77 |
-
# Try to extract answer from plain text patterns
|
| 78 |
-
answer = ""
|
| 79 |
-
m = re.search(r'"answer"\s*:\s*"([^"]+)"', cleaned)
|
| 80 |
-
if m:
|
| 81 |
-
answer = m.group(1)
|
| 82 |
-
elif re.search(r'\b(yes|no|true|false)\b', cleaned, re.I):
|
| 83 |
-
answer = re.search(r'\b(yes|no|true|false)\b', cleaned, re.I).group(0)
|
| 84 |
-
if answer:
|
| 85 |
-
return {"reasoning": cleaned[:100], "answer": answer, "confidence": 0.4, "abstain": False, "is_final": True}
|
| 86 |
return {"reasoning":"","answer":"","confidence":0.0,"abstain":True,"is_final":True}
|
| 87 |
|
| 88 |
def evaluate_model(model_name, label, n_episodes=30, is_trained=False):
|
|
|
|
| 61 |
import re
|
| 62 |
|
| 63 |
def _coerce_confidence(raw, default=0.5):
    """Convert a raw confidence value to a float clamped to [0.0, 1.0].

    Returns *default* when the value is missing, non-numeric, too large to
    represent, or NaN, so one bad field does not discard the whole answer.
    """
    try:
        value = float(raw)
    except (TypeError, ValueError, OverflowError):
        return default
    # NaN is the only float that is not equal to itself; min()/max() would
    # otherwise silently turn it into 1.0.
    if value != value:
        return default
    return max(0.0, min(1.0, value))


def _coerce_flag(raw):
    """Interpret a JSON value as a boolean, honoring string spellings of false."""
    if isinstance(raw, str):
        # bool("false") is True, which would flip the meaning of the flag.
        return raw.strip().lower() not in {"", "false", "no", "0", "none", "null"}
    return bool(raw)


def _is_action(obj):
    """Return True when *obj* is a dict carrying an answer or a reasoning key."""
    return isinstance(obj, dict) and ("answer" in obj or "reasoning" in obj)


def _load_action_object(text):
    """Decode *text* as JSON, falling back to an action object embedded in prose.

    The whole string is tried first. Only when that fails is the text scanned
    for the first ``{`` that starts a decodable action object. Raises the
    original decoding error when nothing usable is found.
    """
    try:
        return json.loads(text)
    except (ValueError, RecursionError) as err:
        whole_error = err

    decoder = json.JSONDecoder()
    for brace in re.finditer(r"\{", text):
        try:
            candidate, _ = decoder.raw_decode(text, brace.start())
        except (ValueError, RecursionError):
            continue
        if _is_action(candidate):
            return candidate
    raise whole_error


def parse_action(text):
    """Parse a model's raw reply into a normalized action dict.

    Markdown code fences are stripped, then the reply is decoded as JSON. If
    the reply is not pure JSON, the first embedded JSON object that has an
    "answer" or "reasoning" key is used instead.

    Args:
        text: Raw model output, expected to be a string.

    Returns:
        A dict with keys "reasoning" (str), "answer" (str), "confidence"
        (float in [0.0, 1.0]), "abstain" (bool) and "is_final" (bool). When
        no action object can be recovered, an abstaining action with an empty
        answer and confidence 0.0 is returned. This function never raises.
    """
    fallback = {
        "reasoning": "",
        "answer": "",
        "confidence": 0.0,
        "abstain": True,
        "is_final": True,
    }

    if not isinstance(text, str):
        print(f"Parse error: expected str, got {type(text).__name__}")
        return fallback

    # One pass removes every fence marker, opening and closing alike.
    text = re.sub(r"```(?:json)?\s*", "", text).strip()

    try:
        obj = _load_action_object(text)
    except (ValueError, RecursionError) as e:
        print(f"Parse error: {e}, text: {text[:100]}")
        return fallback

    if not _is_action(obj):
        return fallback

    # JSON null must not turn into the literal string "None".
    answer = obj.get("answer")
    reasoning = obj.get("reasoning")

    return {
        "reasoning": "" if reasoning is None else str(reasoning),
        "answer": "" if answer is None else str(answer),
        "confidence": _coerce_confidence(obj.get("confidence", 0.5)),
        "abstain": _coerce_flag(obj.get("abstain", False)),
        "is_final": _coerce_flag(obj.get("is_final", True)),
    }
|
| 88 |
|
| 89 |
def evaluate_model(model_name, label, n_episodes=30, is_trained=False):
|