Jayant-Kernel committed on
Commit ·
3d9195a
1
Parent(s): 77e0352
fix: debug model output parsing in evaluation
Browse files - evaluate.py +13 -2
evaluate.py
CHANGED
|
@@ -61,9 +61,9 @@ SYSTEM_PROMPT = """You are answering factual questions. Respond ONLY with a JSON
|
|
| 61 |
import re
|
| 62 |
|
| 63 |
def parse_action(text):
|
| 64 |
-
|
| 65 |
try:
|
| 66 |
-
obj = json.loads(
|
| 67 |
if isinstance(obj, dict) and "reasoning" in obj:
|
| 68 |
return {
|
| 69 |
"reasoning": str(obj.get("reasoning", "")),
|
|
@@ -74,6 +74,15 @@ def parse_action(text):
|
|
| 74 |
}
|
| 75 |
except:
|
| 76 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
return {"reasoning":"","answer":"","confidence":0.0,"abstain":True,"is_final":True}
|
| 78 |
|
| 79 |
def evaluate_model(model_name, label, n_episodes=30, is_trained=False):
|
|
@@ -140,7 +149,9 @@ def evaluate_model(model_name, label, n_episodes=30, is_trained=False):
|
|
| 140 |
skip_special_tokens=True
|
| 141 |
)
|
| 142 |
|
|
|
|
| 143 |
parsed = parse_action(text)
|
|
|
|
| 144 |
|
| 145 |
if parsed["abstain"]:
|
| 146 |
results["abstain"] += 1
|
|
|
|
| 61 |
import re
|
| 62 |
|
| 63 |
def parse_action(text):
|
| 64 |
+
cleaned = re.sub(r"```(?:json)?\s*", "", text).strip()
|
| 65 |
try:
|
| 66 |
+
obj = json.loads(cleaned)
|
| 67 |
if isinstance(obj, dict) and "reasoning" in obj:
|
| 68 |
return {
|
| 69 |
"reasoning": str(obj.get("reasoning", "")),
|
|
|
|
| 74 |
}
|
| 75 |
except:
|
| 76 |
pass
|
| 77 |
+
# Try to extract answer from plain text patterns
|
| 78 |
+
answer = ""
|
| 79 |
+
m = re.search(r'"answer"\s*:\s*"([^"]+)"', cleaned)
|
| 80 |
+
if m:
|
| 81 |
+
answer = m.group(1)
|
| 82 |
+
elif re.search(r'\b(yes|no|true|false)\b', cleaned, re.I):
|
| 83 |
+
answer = re.search(r'\b(yes|no|true|false)\b', cleaned, re.I).group(0)
|
| 84 |
+
if answer:
|
| 85 |
+
return {"reasoning": cleaned[:100], "answer": answer, "confidence": 0.4, "abstain": False, "is_final": True}
|
| 86 |
return {"reasoning":"","answer":"","confidence":0.0,"abstain":True,"is_final":True}
|
| 87 |
|
| 88 |
def evaluate_model(model_name, label, n_episodes=30, is_trained=False):
|
|
|
|
| 149 |
skip_special_tokens=True
|
| 150 |
)
|
| 151 |
|
| 152 |
+
print(f"Model output: {text[:200]}")
|
| 153 |
parsed = parse_action(text)
|
| 154 |
+
print(f"Parsed: {parsed}")
|
| 155 |
|
| 156 |
if parsed["abstain"]:
|
| 157 |
results["abstain"] += 1
|