Spaces:

j-js
/

TradingGameAI

Sleeping

App Files Files Community

j-js committed on Mar 13

Commit

e7c7270

verified ·

1 Parent(s): 3a8ed9f

Create math_normalizer.py

Browse files

Files changed (1) hide show

math_normalizer.py +357 -0

math_normalizer.py ADDED Viewed

	@@ -0,0 +1,357 @@

+from __future__ import annotations
+import re
+import unicodedata
+from typing import Dict
# Symbol -> replacement table applied by _replace_symbol_chars.
#
# Operators and semantic tokens are padded with a space on BOTH sides so a
# replacement can never fuse with a neighbouring token ("a≠b" -> "a != b").
# The extra whitespace is harmless: normalize_math_text collapses runs of
# whitespace as its final step.  Insertion order is significant because the
# table is applied entry by entry.
SYMBOL_REPLACEMENTS: Dict[str, str] = {
    # Equality / comparison
    "=": "=",            # identity entry: documents that "=" is recognised
    "≠": " != ",         # padded on both sides like every other comparison
    "≈": " approx ",
    "~": " approx ",
    "≡": " equivalent ",
    "≜": " = ",
    ":=": " = ",
    ">": " > ",
    "<": " < ",
    "≥": " >= ",
    "≤": " <= ",
    "≪": " << ",
    "≫": " >> ",
    # Arithmetic operators (several dash look-alikes all mean "minus")
    "+": " + ",
    "−": " - ",
    "–": " - ",
    "—": " - ",
    "-": " - ",
    "‒": " - ",
    "±": " plus_minus ",
    "∓": " minus_plus ",
    "*": " * ",
    "×": " * ",
    "⋅": " * ",
    "·": " * ",
    "÷": " / ",
    "/": " / ",
    "∕": " / ",
    "⁄": " / ",
    # Brackets / grouping
    "[": "(",
    "]": ")",
    "{": " { ",
    "}": " } ",
    "⌊": " floor(",
    "⌋": ")",
    "⌈": " ceil(",
    "⌉": ")",
    # Powers / roots
    "^": "^",            # identity entry
    "²": "^2",
    "³": "^3",
    "⁴": "^4",
    "⁵": "^5",
    "⁶": "^6",
    "⁷": "^7",
    "⁸": "^8",
    "⁹": "^9",
    "⁰": "^0",
    "¹": "^1",
    "√": " sqrt ",
    "∛": " cbrt ",
    "∜": " fourth_root ",
    # Percent / rates
    "%": " percent ",
    "‰": " permille ",
    "‱": " permyriad ",
    # Geometry
    "∠": " angle ",
    "∟": " right_angle ",
    "°": " degrees ",
    "′": " prime ",
    "″": " double_prime ",
    "⊥": " perpendicular ",
    "∥": " parallel ",
    "≅": " congruent ",
    "Δ": " triangle ",
    "△": " triangle ",
    "π": " pi ",
    # Algebra / calculus-ish
    "∞": " infinity ",
    "∝": " proportional_to ",
    "∆": " delta ",
    "∑": " sum ",
    "∏": " product ",
    "∫": " integral ",
    # Probability / sets
    "∩": " intersection ",
    "∪": " union ",
    "⊆": " subseteq ",
    "⊂": " subset ",
    "∈": " in ",
    "∉": " not_in ",
    "∅": " empty_set ",
    "|": " | ",
    # Common OCR / typography junk
    "“": '"',
    "”": '"',
    "‘": "'",
    "’": "'",
    "…": "...",
    "\u00a0": " ",   # non-breaking space
}
# Verbal phrase -> token table applied by _replace_text_phrases, which matches
# whole phrases case-insensitively and tries longer phrases first.
#
# Every comparison phrase also has an "is ..." variant so the auxiliary verb is
# absorbed consistently: "x is greater than 5" -> "x > 5", not "x is > 5".
TEXT_REPLACEMENTS: Dict[str, str] = {
    # Verbal math phrases -> more parseable forms
    "divided by": " / ",
    "multiplied by": " * ",
    "times": " * ",
    "plus": " + ",
    "minus": " - ",
    "equals": " = ",
    "is equal to": " = ",
    "is greater than or equal to": " >= ",
    "is less than or equal to": " <= ",
    "greater than or equal to": " >= ",
    "less than or equal to": " <= ",
    "is greater than": " > ",
    "is less than": " < ",
    "greater than": " > ",
    "less than": " < ",
    "is not equal to": " != ",
    "not equal to": " != ",
    "approximately equal to": " approx ",
    "approx equal to": " approx ",
    "squared": "^2",
    "cubed": "^3",
    "square root of": " sqrt ",
    "cube root of": " cbrt ",
    "to the power of": "^",
    "raised to the power of": "^",
    "percent": " percent ",
    "per cent": " percent ",
    "percentage": " percent ",
    "remainder when": " remainder ",
    "is divisible by": " divisible_by ",
    "divisible by": " divisible_by ",
    "is a multiple of": " multiple_of ",
    "multiple of": " multiple_of ",
    "factor of": " factor_of ",
    "prime number": " prime ",
    "consecutive integers": " consecutive_integers ",
    "positive integer": " positive_integer ",
    "negative integer": " negative_integer ",
    "is at least": " >= ",
    "is at most": " <= ",
    "at least": " >= ",
    "at most": " <= ",
    "no more than": " <= ",
    "no less than": " >= ",
    "is more than": " > ",
    "more than": " > ",
    "fewer than": " < ",
    "probability of": " probability ",
    # Statistics / geometry keywords: padded so they stand as separate tokens
    "mean": " mean ",
    "average": " average ",
    "median": " median ",
    "mode": " mode ",
    "standard deviation": " standard_deviation ",
    "variance": " variance ",
    "perimeter": " perimeter ",
    "area": " area ",
    "volume": " volume ",
    "circumference": " circumference ",
    "radius": " radius ",
    "diameter": " diameter ",
    "ratio of": " ratio ",
    "ratio": " ratio ",
    "proportion": " proportion ",
    "sum of": " sum ",
    "difference between": " difference ",
    "product of": " product ",
    "quotient of": " quotient ",
}
# Single-glyph vulgar fractions -> ASCII "numerator/denominator" text,
# consumed by _replace_unicode_fractions.
UNICODE_FRACTIONS: Dict[str, str] = {
    "½": "1/2",
    "⅓": "1/3",
    "⅔": "2/3",
    "¼": "1/4",
    "¾": "3/4",
    "⅕": "1/5",
    "⅖": "2/5",
    "⅗": "3/5",
    "⅘": "4/5",
    "⅙": "1/6",
    "⅚": "5/6",
    "⅐": "1/7",
    "⅛": "1/8",
    "⅜": "3/8",
    "⅝": "5/8",
    "⅞": "7/8",
    "⅑": "1/9",
    "⅒": "1/10",
    "\u2189": "0/3",  # VULGAR FRACTION ZERO THIRDS
}
# Superscript glyph -> plain character.  _replace_superscripts_and_subscripts
# turns each run of these glyphs into "^" followed by the mapped characters,
# so "x²" -> "x^2" and "x⁽ⁿ⁺¹⁾" -> "x^(n+1)".
SUPERSCRIPT_MAP: Dict[str, str] = {
    "⁰": "0",
    "¹": "1",
    "²": "2",
    "³": "3",
    "⁴": "4",
    "⁵": "5",
    "⁶": "6",
    "⁷": "7",
    "⁸": "8",
    "⁹": "9",
    "⁺": "+",
    "⁻": "-",
    "\u207d": "(",  # SUPERSCRIPT LEFT PARENTHESIS
    "\u207e": ")",  # SUPERSCRIPT RIGHT PARENTHESIS
    "\u207f": "n",  # SUPERSCRIPT LATIN SMALL LETTER N
    "\u2071": "i",  # SUPERSCRIPT LATIN SMALL LETTER I
}
# Subscript glyph -> plain character.  _replace_superscripts_and_subscripts
# turns each run of these glyphs into "_" followed by the mapped characters,
# so "x₁" -> "x_1" and "aₙ" -> "a_n".
SUBSCRIPT_MAP: Dict[str, str] = {
    "₀": "0",
    "₁": "1",
    "₂": "2",
    "₃": "3",
    "₄": "4",
    "₅": "5",
    "₆": "6",
    "₇": "7",
    "₈": "8",
    "₉": "9",
    "₊": "+",
    "₋": "-",
    "\u208d": "(",  # SUBSCRIPT LEFT PARENTHESIS
    "\u208e": ")",  # SUBSCRIPT RIGHT PARENTHESIS
    # Index letters commonly used for sequences and sums
    "\u1d62": "i",  # LATIN SUBSCRIPT SMALL LETTER I
    "\u2c7c": "j",  # LATIN SUBSCRIPT SMALL LETTER J
    "\u2096": "k",  # LATIN SUBSCRIPT SMALL LETTER K
    "\u2098": "m",  # LATIN SUBSCRIPT SMALL LETTER M
    "\u2099": "n",  # LATIN SUBSCRIPT SMALL LETTER N
    "\u2093": "x",  # LATIN SUBSCRIPT SMALL LETTER X
}
def _replace_unicode_fractions(text: str) -> str:
    """Expand single-glyph vulgar fractions into ASCII ``n/d`` text.

    A fraction glyph directly after an integer is a mixed number, so the pair
    is rewritten as a parenthesised sum: ``"1½"`` becomes ``"(1+1/2)"``.
    Pasting the digits together would give ``"11/2"`` (eleven halves).
    A glyph standing on its own is replaced in place: ``"½"`` -> ``"1/2"``.

    Args:
        text: Input text, possibly containing keys of ``UNICODE_FRACTIONS``.

    Returns:
        The text with every fraction glyph expanded.
    """
    glyph_alternatives = "|".join(re.escape(glyph) for glyph in UNICODE_FRACTIONS)
    if glyph_alternatives:
        # The lookbehind keeps the whole part a plain integer: the "1" in
        # "2.1½" is a decimal digit, not the whole part of a mixed number.
        mixed_number = re.compile(r"(?<![0-9.])([0-9]+)(" + glyph_alternatives + r")")
        text = mixed_number.sub(
            lambda m: "(" + m.group(1) + "+" + UNICODE_FRACTIONS[m.group(2)] + ")",
            text,
        )
    for glyph, ascii_form in UNICODE_FRACTIONS.items():
        text = text.replace(glyph, ascii_form)
    return text
def _replace_superscripts_and_subscripts(text: str) -> str:
    """Rewrite runs of superscript/subscript glyphs as ``^...`` / ``_...``.

    Each maximal run of characters found in ``SUPERSCRIPT_MAP`` becomes ``"^"``
    followed by the mapped characters; each maximal run found in
    ``SUBSCRIPT_MAP`` becomes ``"_"`` followed by the mapped characters.
    All other characters are copied through unchanged.

    Args:
        text: Input text.

    Returns:
        The text with superscript and subscript runs converted.
    """
    pieces = []
    pos, end = 0, len(text)
    while pos < end:
        # Superscripts are checked first, then subscripts.
        for marker, table in (("^", SUPERSCRIPT_MAP), ("_", SUBSCRIPT_MAP)):
            if text[pos] not in table:
                continue
            stop = pos
            while stop < end and text[stop] in table:
                stop += 1
            pieces.append(marker + "".join(table[glyph] for glyph in text[pos:stop]))
            pos = stop
            break
        else:
            # Neither table knows this character: keep it verbatim.
            pieces.append(text[pos])
            pos += 1
    return "".join(pieces)
# ASCII operators made of characters that SYMBOL_REPLACEMENTS also handles one
# at a time.  They must be matched as a unit, otherwise ">=" is torn apart
# into " > =".
_ASCII_COMPOUND_OPERATORS = (">=", "<=", "!=", "<<", ">>")


def _replace_symbol_chars(text: str) -> str:
    """Replace math symbols using ``SYMBOL_REPLACEMENTS`` in a single pass.

    All keys are tried at each position, longest first, and every input
    character is consumed at most once.  Compared with chaining
    ``str.replace`` per key this:

    * keeps ASCII ``>=``, ``<=``, ``!=``, ``<<`` and ``>>`` intact (emitted
      space-padded, like their Unicode counterparts), and
    * never re-scans text produced by an earlier replacement, so ``"−"`` yields
      ``" - "`` rather than ``"  -  "``.

    Args:
        text: Input text.

    Returns:
        The text with every known symbol replaced.
    """
    table = {op: f" {op} " for op in _ASCII_COMPOUND_OPERATORS}
    # Entries defined in SYMBOL_REPLACEMENTS always win over the defaults above.
    table.update(SYMBOL_REPLACEMENTS)
    longest_first = sorted(table, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(symbol) for symbol in longest_first))
    return pattern.sub(lambda match: table[match.group(0)], text)
def _replace_text_phrases(text: str) -> str:
    """Substitute verbal math phrases using ``TEXT_REPLACEMENTS``.

    Phrases are matched case-insensitively on word boundaries and applied one
    after another, longest phrase first, so that e.g. "greater than or equal
    to" is consumed before the shorter "greater than" can match inside it.

    Args:
        text: Input text.

    Returns:
        The text with every known phrase replaced.
    """
    longest_first = sorted(TEXT_REPLACEMENTS, key=len, reverse=True)
    for phrase in longest_first:
        matcher = re.compile(r"\b" + re.escape(phrase) + r"\b", re.IGNORECASE)
        text = matcher.sub(TEXT_REPLACEMENTS[phrase], text)
    return text
+def _normalize_roots(text: str) -> str:
+    # "sqrt 9" -> "sqrt(9)"
+    text = re.sub(r"\bsqrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"sqrt(\1)", text, flags=re.I)
+    text = re.sub(r"\bcbrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"cbrt(\1)", text, flags=re.I)
+    return text
+def _normalize_percent_expressions(text: str) -> str:
+    # "25 percent of 80" -> "(25/100) * 80"
+    text = re.sub(
+        r"(\d+(?:\.\d+)?)\s*percent\s+of\s+(\d+(?:\.\d+)?)",
+        r"(\1/100) * \2",
+        text,
+        flags=re.I,
+    )
+    # "x percent" -> "(x/100)"
+    text = re.sub(
+        r"(\d+(?:\.\d+)?)\s*percent\b",
+        r"(\1/100)",
+        text,
+        flags=re.I,
+    )
+    # per-mille
+    text = re.sub(
+        r"(\d+(?:\.\d+)?)\s*permille\b",
+        r"(\1/1000)",
+        text,
+        flags=re.I,
+    )
+    return text
+def _normalize_multiplication_spacing(text: str) -> str:
+    # 5x -> 5*x
+    text = re.sub(r"(\d)([a-zA-Z])", r"\1*\2", text)
+    # )x -> )*x
+    text = re.sub(r"(\))([a-zA-Z0-9])", r"\1*\2", text)
+    # x( -> x*(
+    text = re.sub(r"([a-zA-Z0-9])(\()", r"\1*\2", text)
+    return text
def normalize_math_text(text: str) -> str:
    """Normalise free-form math text into a plain, token-friendly string.

    Steps, in order:

    1. Expand vulgar-fraction glyphs, superscripts and subscripts.
    2. Apply the ``SYMBOL_REPLACEMENTS`` entries whose key NFKC would rewrite.
    3. Apply NFKC normalisation (full-width forms, ligatures, ...).
    4. Replace symbols and verbal phrases, then normalise roots, percent
       expressions and implicit multiplication.
    5. Collapse whitespace runs and strip the ends.

    Steps 1-2 must precede NFKC: NFKC maps compatibility characters to their
    plain equivalents, so "x²" would become "x2" and "x₁" would become "x1",
    losing the exponent/index before the dedicated passes could see it.

    Args:
        text: Raw input text; ``None`` or an empty string is accepted.

    Returns:
        The normalised text, or ``""`` for empty input.
    """
    if not text:
        return ""
    text = _replace_unicode_fractions(text)
    text = _replace_superscripts_and_subscripts(text)
    # Symbols that NFKC would rewrite (e.g. the double-prime sign) have to be
    # replaced now, or their table entries can never match.
    for symbol, replacement in SYMBOL_REPLACEMENTS.items():
        if unicodedata.normalize("NFKC", symbol) != symbol:
            text = text.replace(symbol, replacement)
    text = unicodedata.normalize("NFKC", text)
    text = _replace_symbol_chars(text)
    text = _replace_text_phrases(text)
    text = _normalize_roots(text)
    text = _normalize_percent_expressions(text)
    text = _normalize_multiplication_spacing(text)
    # normalize repeated spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text
def normalize_for_solver(text: str) -> str:
    """Normalise *text* and apply solver-oriented aliases.

    After ``normalize_math_text`` the stand-alone token ``pi`` is replaced by
    its numeric value and the token ``approx`` by ``~``.  Both are matched as
    whole words only, so ordinary words that merely contain those letters
    ("pipe", "spin", "approximately") are left intact.

    Args:
        text: Raw input text.

    Returns:
        The normalised text with solver aliases applied.
    """
    text = normalize_math_text(text)
    # make some solver-oriented aliases
    text = re.sub(r"\bpi\b", "3.141592653589793", text)
    text = re.sub(r"\bapprox\b", "~", text)
    return text
def normalize_for_parser(text: str) -> str:
    """Normalise *text* for the router/parser.

    This is ``normalize_math_text`` with nothing added: the semantic tokens it
    produces are passed through unchanged.

    Args:
        text: Raw input text.

    Returns:
        The result of ``normalize_math_text(text)``.
    """
    # No aliasing here: semantic tokens stay as-is for the router/parser.
    return normalize_math_text(text)