| """ |
| Reasoning Metrics - scores text quality across multiple dimensions. |
| |
| Each dimension is scored 0.0-1.0 using concrete textual analysis: |
| regex patterns, keyword detection, sentence structure analysis, |
| word counts, and concept density measures. |
| """ |
|
|
from __future__ import annotations

import math
import re
from collections import Counter
from typing import Dict, List, Optional
|
|
|
|
| |
| |
| |
|
|
| _TRANSITION_WORDS = { |
| "therefore", "however", "moreover", "furthermore", "consequently", |
| "nevertheless", "additionally", "specifically", "thus", "hence", |
| "accordingly", "meanwhile", "similarly", "conversely", "likewise", |
| "in contrast", "on the other hand", "as a result", "for example", |
| "for instance", "in addition", "in particular", "in summary", |
| "to illustrate", "that is", "notably", "indeed", "alternatively", |
| } |
|
|
| _EXAMPLE_MARKERS = { |
| "for example", "for instance", "such as", "e.g.", "e.g.,", |
| "consider", "imagine", "suppose", "like when", "think of", |
| "analogy", "analogous", "metaphor", "illustration", "to illustrate", |
| "case in point", "picture", "envision", "scenario", |
| } |
|
|
| _PERSPECTIVE_MARKERS = { |
| "on the other hand", "from another perspective", "alternatively", |
| "some argue", "others believe", "one view", "another view", |
| "proponents", "opponents", "critics", "supporters", |
| "different perspective", "counterargument", "counter-argument", |
| "multiple perspectives", "various viewpoints", "diverse views", |
| "some scholars", "other researchers", "in contrast", |
| "conversely", "while some", "whereas others", |
| "from a … standpoint", "from the standpoint", |
| "different schools of thought", "competing theories", |
| "pluralistic", "multifaceted", |
| } |
|
|
| _SCIENTIFIC_TERMS = { |
| "hypothesis", "theory", "empirical", "variable", "correlation", |
| "causation", "experiment", "observation", "evidence", "data", |
| "quantitative", "qualitative", "statistical", "significant", |
| "methodology", "systematic", "peer-reviewed", "replicable", |
| "falsifiable", "paradigm", "model", "framework", "mechanism", |
| "phenomenon", "equation", "entropy", "quantum", "relativity", |
| "thermodynamic", "kinetic", "potential", "electromagnetic", |
| "wavelength", "frequency", "spectrum", "molecular", "cellular", |
| "neural", "cognitive", "algorithm", "computational", "stochastic", |
| "deterministic", "probabilistic", "inference", "deduction", |
| "induction", "axiom", "theorem", "coefficient", "parameter", |
| "optimization", "convergence", "divergence", "gradient", |
| "eigenvalue", "tensor", "vector", "scalar", "integral", |
| "derivative", "differential", "asymptotic", "heuristic", |
| } |
|
|
| _ETHICAL_TERMS = { |
| "ethical", "moral", "responsibility", "accountability", "fairness", |
| "justice", "bias", "harm", "benefit", "consequence", "implication", |
| "stakeholder", "rights", "duty", "obligation", "dilemma", |
| "autonomy", "consent", "privacy", "transparency", "trust", |
| "equity", "inclusion", "diversity", "sustainability", |
| "well-being", "welfare", "dignity", "integrity", "virtue", |
| "utilitarian", "deontological", "consequentialist", "normative", |
| "values", "principles", "compassion", "empathy", |
| "social impact", "unintended consequences", |
| } |
|
|
| _STRUCTURE_PATTERNS = [ |
| re.compile(r"^\s*\d+[\.\)]\s", re.MULTILINE), |
| re.compile(r"^\s*[-*]\s", re.MULTILINE), |
| re.compile(r"^#{1,4}\s", re.MULTILINE), |
| re.compile(r"\b(first|second|third|finally|lastly)\b", re.I), |
| re.compile(r"\b(step\s+\d+|phase\s+\d+)\b", re.I), |
| re.compile(r"\b(in conclusion|to summarize|in summary)\b", re.I), |
| re.compile(r"\b(introduction|background|method|result|discussion|conclusion)\b", re.I), |
| ] |
|
|
|
|
| |
| |
| |
|
|
| def _word_tokenize(text: str) -> List[str]: |
| """Simple whitespace + punctuation tokeniser.""" |
| return re.findall(r"[A-Za-z]+(?:[-'][A-Za-z]+)*", text.lower()) |
|
|
|
|
| def _sentences(text: str) -> List[str]: |
| """Split text into sentences (simple heuristic).""" |
| parts = re.split(r'(?<=[.!?])\s+', text.strip()) |
| return [s for s in parts if len(s) > 2] |
|
|
|
|
| def _unique_word_ratio(words: List[str]) -> float: |
| if not words: |
| return 0.0 |
| return len(set(words)) / len(words) |
|
|
|
|
| def _sigmoid(x: float, midpoint: float = 0.0, steepness: float = 1.0) -> float: |
| """Soft clamping via logistic function, output in (0, 1).""" |
| try: |
| return 1.0 / (1.0 + math.exp(-steepness * (x - midpoint))) |
| except OverflowError: |
| return 0.0 if x < midpoint else 1.0 |
|
|
|
|
| def _keyword_density(words: List[str], keyword_set: set) -> float: |
| """Fraction of *unique* keywords from the set that appear in words.""" |
| if not keyword_set: |
| return 0.0 |
| word_set = set(words) |
| hits = word_set & keyword_set |
| return len(hits) / len(keyword_set) |
|
|
|
|
| def _phrase_count(text: str, phrases: set) -> int: |
| """Count how many distinct phrases from *phrases* appear in text.""" |
| text_lower = text.lower() |
| return sum(1 for p in phrases if p in text_lower) |
|
|
|
|
| |
| |
| |
|
|
class ReasoningMetrics:
    """Score a reasoning response on multiple quality dimensions.

    Each ``_score_*`` helper returns a float in [0.0, 1.0], rounded to
    four decimals.  The public entry point is :meth:`score_reasoning`,
    which also produces a weighted ``overall`` score plus raw word and
    sentence counts.
    """

    # Relative importance of each dimension in the overall score.
    # score_reasoning() renormalizes, so these need not sum to 1.0.
    DEFAULT_WEIGHTS: Dict[str, float] = {
        "clarity": 0.15,
        "structure": 0.15,
        "depth": 0.15,
        "examples": 0.10,
        "multi_perspective": 0.10,
        "scientific_rigor": 0.15,
        "ethical_awareness": 0.10,
        "coherence": 0.10,
    }

    def __init__(self, weights: Optional[Dict[str, float]] = None):
        """Use *weights* if given, else a copy of DEFAULT_WEIGHTS.

        NOTE(review): an explicitly passed empty dict falls back to the
        defaults because of the ``or`` — confirm no caller relies on ``{}``.
        """
        self.weights = weights or dict(self.DEFAULT_WEIGHTS)

    # ------------------------------------------------------------------
    # Dimension scorers
    # ------------------------------------------------------------------

    def _score_clarity(self, text: str, words: List[str], sents: List[str]) -> float:
        """Clarity: sentence length near ~20 words, lexical variety, transitions."""
        if not sents:
            return 0.0

        # Penalize average sentence lengths far from ~20 words.
        avg_sent_len = len(words) / len(sents)
        len_score = 1.0 - min(abs(avg_sent_len - 20) / 20, 1.0)

        # Type-token ratio as a proxy for vocabulary variety.
        diversity = _unique_word_ratio(words)

        # Responses shorter than ~50 words are penalized proportionally.
        length_penalty = min(len(words) / 50, 1.0)

        # Distinct transition phrases, scaled against sentence count.
        transition_count = _phrase_count(text, _TRANSITION_WORDS)
        transition_score = min(transition_count / max(len(sents) * 0.3, 1), 1.0)

        score = (
            0.35 * len_score
            + 0.25 * diversity
            + 0.20 * length_penalty
            + 0.20 * transition_score
        )
        return round(min(max(score, 0.0), 1.0), 4)

    def _score_structure(self, text: str, sents: List[str]) -> float:
        """Structure: lists/headings/step markers, paragraphs, sentence count."""
        if not text.strip():
            return 0.0

        # Distinct structural pattern families present (4+ => full marks).
        pattern_hits = sum(1 for p in _STRUCTURE_PATTERNS if p.search(text))
        pattern_score = min(pattern_hits / 4, 1.0)

        # Blank-line-separated paragraphs (4+ => full marks).
        paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
        para_score = min(len(paragraphs) / 4, 1.0)

        # Enough sentences to carry a structured argument (8+ => full marks).
        sent_score = min(len(sents) / 8, 1.0)

        score = 0.50 * pattern_score + 0.25 * para_score + 0.25 * sent_score
        return round(min(max(score, 0.0), 1.0), 4)

    def _score_depth(self, text: str, words: List[str], sents: List[str]) -> float:
        """Depth: word count, long-word share, concept density, sentence count.

        *text* is unused here; the parameter is kept for signature symmetry
        with the other scorers.
        """
        if not words:
            return 0.0

        # Soft target of ~200 words via a logistic curve.
        wc_score = _sigmoid(len(words), midpoint=200, steepness=0.015)

        # Share of "long" words (>= 8 chars); 15% long words => full marks.
        long_words = [w for w in words if len(w) >= 8]
        complexity = min(len(long_words) / max(len(words) * 0.15, 1), 1.0)

        # Distinct words of >= 3 chars as a rough concept inventory;
        # half the token count distinct => full marks.
        concepts = set(w for w in words if len(w) >= 3)
        concept_density = min(len(concepts) / max(len(words) * 0.5, 1), 1.0)

        sent_depth = min(len(sents) / 10, 1.0)

        score = (
            0.30 * wc_score
            + 0.25 * complexity
            + 0.25 * concept_density
            + 0.20 * sent_depth
        )
        return round(min(max(score, 0.0), 1.0), 4)

    def _score_examples(self, text: str) -> float:
        """Examples: marker phrases, quotations, code, numbers with units."""
        if not text.strip():
            return 0.0

        marker_hits = _phrase_count(text, _EXAMPLE_MARKERS)

        # Quoted passages of at least 5 characters.
        quotes = len(re.findall(r'"[^"]{5,}"', text))

        # Fenced code blocks (``` pairs) and inline `code` spans.
        code_blocks = len(re.findall(r'```', text)) // 2
        inline_code = len(re.findall(r'`[^`]+`', text))

        # Numbers carrying a unit or percent sign.
        # BUG FIX: the old pattern placed \b after the '%' alternative, but
        # '%' is a non-word character, so the trailing boundary never
        # existed and percentages were never counted.  '%' now terminates
        # the match itself; alphabetic units still require a boundary.
        numbers = len(re.findall(
            r'\b\d+(?:\.\d+)?\s*(?:%|(?:kg|m|km|s|ms|Hz|J|W|N)\b)', text
        ))

        total_evidence = marker_hits + quotes + code_blocks + inline_code + numbers
        score = min(total_evidence / 5, 1.0)
        return round(min(max(score, 0.0), 1.0), 4)

    def _score_multi_perspective(self, text: str) -> float:
        """Multi-perspective: viewpoint markers, contrastive hedges, questions."""
        if not text.strip():
            return 0.0

        perspective_hits = _phrase_count(text, _PERSPECTIVE_MARKERS)

        # Contrastive/hedging conjunctions suggest a balanced discussion.
        hedge_words = len(re.findall(
            r'\b(?:but|however|although|though|yet|still|nonetheless|'
            r'notwithstanding|despite|regardless)\b',
            text, re.I
        ))

        # Rhetorical questions count half as much as hedges.
        questions = text.count('?')

        total = perspective_hits * 2 + hedge_words + questions * 0.5
        score = min(total / 8, 1.0)
        return round(min(max(score, 0.0), 1.0), 4)

    def _score_scientific_rigor(self, text: str, words: List[str]) -> float:
        """Scientific rigor: terminology, quantitative phrases, evidence cues."""
        if not words:
            return 0.0

        # Distinct scientific terms present (6+ => full marks).
        sci_hits = sum(1 for w in set(words) if w in _SCIENTIFIC_TERMS)
        term_score = min(sci_hits / 6, 1.0)

        # Quantitative expressions such as "3x", "40 percent", "2.5 ratio".
        # BUG FIX: same '%'-before-\b problem as in _score_examples; the
        # '%' alternative now ends the match itself.
        quant = len(re.findall(
            r'\b\d+(?:\.\d+)?\s*(?:%|(?:x|times|percent|ratio|factor)\b)',
            text, re.I
        ))
        quant += len(re.findall(r'[<>=]+\s*\d', text))
        quant_score = min(quant / 3, 1.0)

        # Causal / evidential phrasing.
        causal = len(re.findall(
            r'\b(?:because|caused? by|leads? to|results? in|due to|'
            r'evidence suggests?|research shows?|studies indicate|'
            r'according to|demonstrated|proven|measured)\b',
            text, re.I
        ))
        causal_score = min(causal / 4, 1.0)

        score = 0.45 * term_score + 0.25 * causal_score + 0.30 * quant_score
        return round(min(max(score, 0.0), 1.0), 4)

    def _score_ethical_awareness(self, text: str, words: List[str]) -> float:
        """Ethical awareness: ethics vocabulary, implication talk, stakeholders.

        NOTE(review): multi-word entries in _ETHICAL_TERMS (e.g. "social
        impact") can never match the per-token membership test below.
        """
        if not words:
            return 0.0

        eth_hits = sum(1 for w in set(words) if w in _ETHICAL_TERMS)
        term_score = min(eth_hits / 4, 1.0)

        # Language about implications / risks / responsibility.
        impl = len(re.findall(
            r'\b(?:implication|consequence|impact|risk|concern|'
            r'should|ought|must consider|raises questions|'
            r'responsible|accountable|careful|caution)\b',
            text, re.I
        ))
        impl_score = min(impl / 4, 1.0)

        # Mentions of affected parties.
        stakeholder = len(re.findall(
            r'\b(?:people|society|community|individual|user|patient|'
            r'citizen|public|vulnerable|marginalized|affected)\b',
            text, re.I
        ))
        stake_score = min(stakeholder / 3, 1.0)

        score = 0.40 * term_score + 0.35 * impl_score + 0.25 * stake_score
        return round(min(max(score, 0.0), 1.0), 4)

    def _score_coherence(self, text: str, sents: List[str], words: List[str]) -> float:
        """Coherence: adjacent-sentence vocabulary overlap + referential ties.

        *words* is unused; the parameter is retained for call-site
        compatibility.
        """
        if len(sents) < 2:
            return 0.5  # too short to measure; neutral score

        # Jaccard overlap between each pair of adjacent sentences.
        overlaps = []
        for i in range(len(sents) - 1):
            w1 = set(_word_tokenize(sents[i]))
            w2 = set(_word_tokenize(sents[i + 1]))
            if w1 | w2:
                overlaps.append(len(w1 & w2) / len(w1 | w2))
            else:
                overlaps.append(0.0)
        avg_overlap = sum(overlaps) / len(overlaps) if overlaps else 0.0

        # Ideal overlap is ~0.2: connected but not repetitive.
        overlap_score = 1.0 - abs(avg_overlap - 0.2) / 0.4
        overlap_score = max(overlap_score, 0.0)

        # Referential pronouns/demonstratives per sentence.
        pronoun_count = len(re.findall(
            r'\b(?:this|that|these|those|it|they|its|their|such|said)\b',
            text, re.I
        ))
        ref_score = min(pronoun_count / max(len(sents), 1) / 1.5, 1.0)

        score = 0.60 * overlap_score + 0.40 * ref_score
        return round(min(max(score, 0.0), 1.0), 4)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def score_reasoning(self, text: str) -> Dict[str, float]:
        """Score *text* on every dimension plus a weighted ``overall``.

        Returns a dict with float scores in [0.0, 1.0] for clarity,
        structure, depth, examples, multi_perspective, scientific_rigor,
        ethical_awareness, coherence, and overall — plus integer
        ``word_count`` and ``sentence_count`` entries.
        """
        words = _word_tokenize(text)
        sents = _sentences(text)

        scores: Dict[str, float] = {
            "clarity": self._score_clarity(text, words, sents),
            "structure": self._score_structure(text, sents),
            "depth": self._score_depth(text, words, sents),
            "examples": self._score_examples(text),
            "multi_perspective": self._score_multi_perspective(text),
            "scientific_rigor": self._score_scientific_rigor(text, words),
            "ethical_awareness": self._score_ethical_awareness(text, words),
            "coherence": self._score_coherence(text, sents, words),
        }

        # Weighted mean over the configured weights; fall back to a plain
        # mean if the weights contribute zero total mass.
        total_weight = sum(self.weights.get(k, 0) for k in scores)
        if total_weight > 0:
            overall = sum(
                scores[k] * self.weights.get(k, 0) for k in scores
            ) / total_weight
        else:
            overall = sum(scores.values()) / len(scores)

        scores["overall"] = round(overall, 4)
        scores["word_count"] = len(words)
        scores["sentence_count"] = len(sents)
        return scores

    def score_batch(self, texts: List[str]) -> List[Dict[str, float]]:
        """Score each text in *texts*; see :meth:`score_reasoning`."""
        return [self.score_reasoning(t) for t in texts]

    def compare(self, text_a: str, text_b: str) -> Dict[str, Dict[str, float]]:
        """Compare two responses; ``delta`` holds candidate minus baseline."""
        sa = self.score_reasoning(text_a)
        sb = self.score_reasoning(text_b)
        delta = {k: round(sb[k] - sa[k], 4) for k in sa if isinstance(sa[k], (int, float))}
        return {"baseline": sa, "candidate": sb, "delta": delta}
|
|