Spaces:
Sleeping
Sleeping
| """ | |
| ECHO ULTIMATE β Robust <confidence><answer> parser. | |
| Handles 15+ edge cases. NEVER crashes. Always returns a ParseResult. | |
| """ | |
| import re | |
| import logging | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
| # ββ Regex patterns ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| _CONF_TAG_RE = re.compile(r"<confidence>\s*([^<]*?)\s*</confidence>", re.IGNORECASE | re.DOTALL) | |
| _ANS_TAG_RE = re.compile(r"<answer>\s*(.*?)\s*</answer>", re.IGNORECASE | re.DOTALL) | |
| _NUM_RE = re.compile(r"-?\d+(?:\.\d+)?") | |
| _QUOTES_RE = re.compile(r'^["\'](.+)["\']$', re.DOTALL) | |
| # Verbal confidence map | |
| _VERBAL_MAP = { | |
| "very sure": 90, "very certain": 90, "extremely sure": 95, "absolutely sure": 98, | |
| "certain": 88, "confident": 78, "sure": 75, "fairly sure": 70, | |
| "somewhat sure": 60, "unsure": 35, "uncertain": 30, "not sure": 25, | |
| "very unsure": 15, "very uncertain": 15, "no idea": 5, "no clue": 5, | |
| "high": 85, "medium": 50, "low": 25, "moderate": 55, | |
| "probably": 65, "likely": 65, "unlikely": 30, "doubtful": 20, | |
| } | |
| DEFAULT_CONFIDENCE = 50 | |
| class ParseResult: | |
| """Result of parsing one LLM response.""" | |
| confidence: int = DEFAULT_CONFIDENCE | |
| answer: str = "" | |
| parse_success: bool = False | |
| confidence_source: str = "default" # "tag"|"default"|"clipped"|"inferred"|"verbal" | |
| answer_source: str = "empty" # "tag"|"last_sentence"|"full_text"|"empty" | |
| is_abstention: bool = False # True if answer is "I don't know" | |
| raw: str = "" | |
| # ββ Confidence extraction βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _extract_confidence(text: str) -> tuple[int, str]: | |
| """Return (confidence_int, source_label). Never raises.""" | |
| matches = _CONF_TAG_RE.findall(text) | |
| if not matches: | |
| return DEFAULT_CONFIDENCE, "default" | |
| raw = matches[0].strip() # use first match only (edge case 8) | |
| if not raw: | |
| return DEFAULT_CONFIDENCE, "default" | |
| # Edge case 6: verbal confidence | |
| raw_lower = raw.lower() | |
| for phrase, val in _VERBAL_MAP.items(): | |
| if phrase in raw_lower: | |
| return val, "verbal" | |
| # Edge case 7 + 10 + 11: float / out-of-range number | |
| nums = _NUM_RE.findall(raw.replace(",", "")) | |
| if nums: | |
| try: | |
| val = round(float(nums[0])) | |
| clipped = max(0, min(100, val)) | |
| source = "clipped" if clipped != val else "tag" | |
| return clipped, source | |
| except ValueError: | |
| pass | |
| return DEFAULT_CONFIDENCE, "default" | |
| # ββ Answer extraction βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _extract_answer(text: str) -> tuple[str, str]: | |
| """Return (answer_str, source_label). Never raises.""" | |
| matches = _ANS_TAG_RE.findall(text) | |
| if matches: | |
| raw_ans = matches[0].strip() | |
| # Edge case 13: strip surrounding quotes | |
| m = _QUOTES_RE.match(raw_ans) | |
| if m: | |
| raw_ans = m.group(1).strip() | |
| return raw_ans, "tag" | |
| # No answer tag β fall back to text after </confidence> | |
| after_conf = re.split(r"</confidence>", text, flags=re.IGNORECASE, maxsplit=1) | |
| if len(after_conf) > 1: | |
| tail = after_conf[1].strip() | |
| # Remove any remaining tags | |
| tail = re.sub(r"<[^>]+>", " ", tail).strip() | |
| if tail: | |
| return tail, "full_text" | |
| # Last sentence fallback | |
| clean = re.sub(r"<[^>]+>.*?</[^>]+>", " ", text, flags=re.DOTALL) | |
| clean = re.sub(r"<[^>]+>", " ", clean).strip() | |
| sentences = [s.strip() for s in re.split(r"[.!?]", clean) if s.strip()] | |
| if sentences: | |
| return sentences[-1], "last_sentence" | |
| return "", "empty" | |
| # ββ Main parse function βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def parse_response(text) -> ParseResult: | |
| """ | |
| Parse an LLM response into confidence and answer. | |
| Handles edge cases: | |
| 1. Perfect format | |
| 2. Reversed tags | |
| 3. No confidence tag β default 50 | |
| 4. No answer tag β extract from remaining text | |
| 5. Confidence out of range β clip to [0,100] | |
| 6. Verbal confidence ("high", "low", "very sure") β mapped to int | |
| 7. Float confidence β rounded | |
| 8. Multiple tags β first occurrence | |
| 9. Nested tags β regex extracts correctly | |
| 10. Confidence > 100 β clipped to 100 | |
| 11. Negative confidence β clipped to 0 | |
| 12. Empty answer β empty string | |
| 13. Answer with quotes β stripped | |
| 14. "I don't know" β is_abstention=True, confidence=5 | |
| 15. None / non-string input β safe defaults | |
| """ | |
| if text is None: | |
| return ParseResult(raw="") | |
| if not isinstance(text, str): | |
| try: | |
| text = str(text) | |
| except Exception: | |
| return ParseResult(raw="") | |
| conf, conf_src = _extract_confidence(text) | |
| ans, ans_src = _extract_answer(text) | |
| # Edge case 14: abstention detection | |
| is_abstention = False | |
| if ans and any(phrase in ans.lower() for phrase in | |
| ["i don't know", "i do not know", "i'm not sure", "no idea", "don't know"]): | |
| is_abstention = True | |
| conf = min(conf, 10) | |
| conf_src = "inferred" | |
| parse_success = (conf_src == "tag" or conf_src == "verbal") and ans_src == "tag" | |
| return ParseResult( | |
| confidence=conf, | |
| answer=ans, | |
| parse_success=parse_success, | |
| confidence_source=conf_src, | |
| answer_source=ans_src, | |
| is_abstention=is_abstention, | |
| raw=text, | |
| ) | |
| # ββ Prompt formatting βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def format_prompt( | |
| question: str, | |
| domain: str, | |
| difficulty: str = "medium", | |
| show_difficulty: bool = True, | |
| ) -> str: | |
| """ | |
| Build a formatted prompt combining the system instruction + question. | |
| Args: | |
| show_difficulty: Phase 1 shows difficulty; Phase 2+ hides it. | |
| """ | |
| from config import cfg | |
| domain_hints = { | |
| "math": "This is a math problem. Give a numeric answer.", | |
| "logic": "This is a logic/reasoning question. Give the letter (A/B/C/D).", | |
| "factual": "This is a factual question. Give a concise text answer.", | |
| "science": "This is a science question. Give the letter or a concise answer.", | |
| "medical": "This is a medical question. Give the letter (A/B/C/D).", | |
| "coding": "This is a coding question. Give a concise answer.", | |
| "creative": "This is a creative question. Give a short text answer.", | |
| } | |
| hint = domain_hints.get(domain, "Give a concise answer.") | |
| diff_str = f" [{difficulty.upper()}]" if show_difficulty else "" | |
| header = f"Domain: {domain.capitalize()}{diff_str}\n{hint}\n\n" | |
| return f"{cfg.SYSTEM_PROMPT}\n\n{header}Question: {question}" | |
| # ββ Self-tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| if __name__ == "__main__": | |
| failures = [] | |
| def check(text, exp_conf, exp_ans, label, exp_abst=False): | |
| r = parse_response(text) | |
| ok = True | |
| if exp_conf is not None and r.confidence != exp_conf: | |
| failures.append(f"[{label}] confidence: expected {exp_conf}, got {r.confidence}") | |
| ok = False | |
| if exp_ans is not None and r.answer != exp_ans: | |
| failures.append(f"[{label}] answer: expected '{exp_ans}', got '{r.answer}'") | |
| ok = False | |
| if r.is_abstention != exp_abst: | |
| failures.append(f"[{label}] is_abstention: expected {exp_abst}, got {r.is_abstention}") | |
| ok = False | |
| if ok: | |
| print(f" β {label}") | |
| print("Running ECHO Ultimate parser testsβ¦") | |
| check("<confidence>75</confidence><answer>Paris</answer>", 75, "Paris", "1. perfect format") | |
| check("<answer>Paris</answer><confidence>75</confidence>", 75, "Paris", "2. reversed tags") | |
| check("<answer>London</answer>", DEFAULT_CONFIDENCE, "London", "3. no confidence tag") | |
| check("<confidence>55</confidence>", 55, None, "4. no answer tag") | |
| check("<confidence>150</confidence><answer>x</answer>", 100, "x", "5. confidence clipped high") | |
| check("<confidence>high</confidence><answer>Paris</answer>", 85, "Paris", "6. verbal 'high'") | |
| check("<confidence>very sure</confidence><answer>yes</answer>", 90, "yes", "6b. verbal 'very sure'") | |
| check("<confidence>73.6</confidence><answer>42</answer>", 74, "42", "7. float confidence") | |
| check("<confidence>80</confidence><answer>A</answer><confidence>30</confidence>", 80, "A", "8. multiple tags") | |
| check("<confidence>95</confidence><answer>Rome</answer>", 95, "Rome", "9. normal nested") | |
| check("<confidence>200</confidence><answer>x</answer>", 100, "x", "10. > 100 clipped") | |
| check("<confidence>-5</confidence><answer>x</answer>", 0, "x", "11. negative clipped") | |
| check("<confidence>50</confidence><answer></answer>", 50, "", "12. empty answer") | |
| check('<confidence>70</confidence><answer>"Paris"</answer>', 70, "Paris", "13. quoted answer") | |
| r14 = parse_response("<confidence>80</confidence><answer>I don't know</answer>") | |
| assert r14.is_abstention, "14. abstention flag" | |
| assert r14.confidence <= 10, "14. abstention confidence" | |
| print(" β 14. I don't know β abstention=True, confβ€10") | |
| check(None, DEFAULT_CONFIDENCE, "", "15. None input") | |
| check(42, DEFAULT_CONFIDENCE, None, "15b. int input") | |
| check("", DEFAULT_CONFIDENCE, "", "15c. empty string") | |
| check(" <confidence> 60 </confidence> <answer> Berlin </answer> ", 60, "Berlin", "whitespace trimmed") | |
| check("<CONFIDENCE>80</CONFIDENCE><ANSWER>Rome</ANSWER>", 80, "Rome", "uppercase tags") | |
| check("<confidence>50</confidence><answer>The Eiffel Tower</answer>", 50, "The Eiffel Tower", "multi-word answer") | |
| if failures: | |
| print("\nβ FAILURES:") | |
| for f in failures: | |
| print(f" {f}") | |
| else: | |
| print("\nβ All parser tests passed.") | |