j-js committed on
Commit
e7c7270
·
verified ·
1 Parent(s): 3a8ed9f

Create math_normalizer.py

Browse files
Files changed (1) hide show
  1. math_normalizer.py +357 -0
math_normalizer.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import unicodedata
5
+ from typing import Dict
6
+
7
+
8
# Maps math/typography symbols to ASCII tokens; applied by _replace_symbol_chars.
#
# Every non-ASCII key is written as a \uXXXX escape (with the character name
# in a trailing comment) so the table cannot be corrupted by a wrong source
# encoding: a mis-decoded key silently becomes a multi-character string that
# never matches real input.
SYMBOL_REPLACEMENTS: Dict[str, str] = {
    # Equality / comparison
    "=": "=",
    "\u2260": " !=",  # NOT EQUAL TO; keep spaced form easy to regex
    "\u2248": " approx ",  # ALMOST EQUAL TO
    "~": " approx ",
    "\u2261": " equivalent ",  # IDENTICAL TO
    "\u225c": " = ",  # DELTA EQUAL TO
    ":=": " = ",
    ">": " > ",
    "<": " < ",
    "\u2265": " >= ",  # GREATER-THAN OR EQUAL TO
    "\u2264": " <= ",  # LESS-THAN OR EQUAL TO
    "\u226a": " << ",  # MUCH LESS-THAN
    "\u226b": " >> ",  # MUCH GREATER-THAN

    # Arithmetic operators
    "+": " + ",
    "\u2212": " - ",  # MINUS SIGN
    "\u2013": " - ",  # EN DASH
    "\u2014": " - ",  # EM DASH
    "-": " - ",
    "\u2022": " - ",  # BULLET
    "\u00b1": " plus_minus ",  # PLUS-MINUS SIGN
    "\u2213": " minus_plus ",  # MINUS-OR-PLUS SIGN
    "*": " * ",
    "\u00d7": " * ",  # MULTIPLICATION SIGN
    "\u22c5": " * ",  # DOT OPERATOR
    "\u00b7": " * ",  # MIDDLE DOT
    "\u00f7": " / ",  # DIVISION SIGN
    "/": " / ",
    "\u2215": " / ",  # DIVISION SLASH
    "\u2044": " / ",  # FRACTION SLASH

    # Brackets / grouping
    "[": "(",
    "]": ")",
    "{": " { ",
    "}": " } ",
    "\u230a": " floor(",  # LEFT FLOOR
    "\u230b": ")",  # RIGHT FLOOR
    "\u2308": " ceil(",  # LEFT CEILING
    "\u2309": ")",  # RIGHT CEILING

    # Powers / roots
    "^": "^",
    "\u00b2": "^2",  # SUPERSCRIPT TWO
    "\u00b3": "^3",  # SUPERSCRIPT THREE
    "\u2074": "^4",  # SUPERSCRIPT FOUR
    "\u2075": "^5",  # SUPERSCRIPT FIVE
    "\u2076": "^6",  # SUPERSCRIPT SIX
    "\u2077": "^7",  # SUPERSCRIPT SEVEN
    "\u2078": "^8",  # SUPERSCRIPT EIGHT
    "\u2079": "^9",  # SUPERSCRIPT NINE
    "\u2070": "^0",  # SUPERSCRIPT ZERO
    "\u00b9": "^1",  # SUPERSCRIPT ONE
    "\u221a": " sqrt ",  # SQUARE ROOT
    "\u221b": " cbrt ",  # CUBE ROOT
    "\u221c": " fourth_root ",  # FOURTH ROOT

    # Percent / rates
    "%": " percent ",
    "\u2030": " permille ",  # PER MILLE SIGN
    "\u2031": " permyriad ",  # PER TEN THOUSAND SIGN

    # Geometry
    "\u2220": " angle ",  # ANGLE
    "\u221f": " right_angle ",  # RIGHT ANGLE
    "\u00b0": " degrees ",  # DEGREE SIGN
    "\u2032": " prime ",  # PRIME
    "\u2033": " double_prime ",  # DOUBLE PRIME
    "\u22a5": " perpendicular ",  # UP TACK
    "\u2225": " parallel ",  # PARALLEL TO
    "\u2245": " congruent ",  # APPROXIMATELY EQUAL TO
    "\u0394": " triangle ",  # GREEK CAPITAL LETTER DELTA
    "\u25b3": " triangle ",  # WHITE UP-POINTING TRIANGLE
    "\u03c0": " pi ",  # GREEK SMALL LETTER PI

    # Algebra / calculus-ish
    "\u221e": " infinity ",  # INFINITY
    "\u221d": " proportional_to ",  # PROPORTIONAL TO
    "\u2206": " delta ",  # INCREMENT
    "\u2211": " sum ",  # N-ARY SUMMATION
    "\u220f": " product ",  # N-ARY PRODUCT
    "\u222b": " integral ",  # INTEGRAL

    # Probability / sets
    "\u2229": " intersection ",  # INTERSECTION
    "\u222a": " union ",  # UNION
    "\u2286": " subseteq ",  # SUBSET OF OR EQUAL TO
    "\u2282": " subset ",  # SUBSET OF
    "\u2208": " in ",  # ELEMENT OF
    "\u2209": " not_in ",  # NOT AN ELEMENT OF
    "\u2205": " empty_set ",  # EMPTY SET
    "|": " | ",

    # Common OCR / typography junk
    "\u201c": '"',  # LEFT DOUBLE QUOTATION MARK
    "\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK
    "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
    "\u2026": "...",  # HORIZONTAL ELLIPSIS
    "\u00a0": " ",  # non-breaking space
}
112
+
113
+
114
# Verbal math phrases -> more parseable forms.
# Consumed by _replace_text_phrases, which applies the longest phrases first,
# so overlapping entries ("greater than or equal to" / "greater than") are safe.
TEXT_REPLACEMENTS: Dict[str, str] = {
    # --- arithmetic verbs ---
    "divided by": " / ",
    "multiplied by": " * ",
    "times": " * ",
    "plus": " + ",
    "minus": " - ",
    # --- equality and ordering ---
    "equals": " = ",
    "is equal to": " = ",
    "is greater than or equal to": " >= ",
    "is less than or equal to": " <= ",
    "greater than or equal to": " >= ",
    "less than or equal to": " <= ",
    "greater than": " > ",
    "less than": " < ",
    "not equal to": " != ",
    "approximately equal to": " approx ",
    "approx equal to": " approx ",
    # --- powers and roots ---
    "squared": "^2",
    "cubed": "^3",
    "square root of": " sqrt ",
    "cube root of": " cbrt ",
    "to the power of": "^",
    "raised to the power of": "^",
    # --- percentages ---
    "percent": " percent ",
    "per cent": " percent ",
    "percentage": " percent ",
    # --- number theory ---
    "remainder when": " remainder ",
    "is divisible by": " divisible_by ",
    "divisible by": " divisible_by ",
    "is a multiple of": " multiple_of ",
    "multiple of": " multiple_of ",
    "factor of": " factor_of ",
    "prime number": " prime ",
    "consecutive integers": " consecutive_integers ",
    "positive integer": " positive_integer ",
    "negative integer": " negative_integer ",
    # --- verbal bounds ---
    "at least": " >= ",
    "at most": " <= ",
    "no more than": " <= ",
    "no less than": " >= ",
    "more than": " > ",
    "fewer than": " < ",
    # --- probability and statistics ---
    "probability of": " probability ",
    "mean": " mean ",
    "average": " average ",
    "median": " median ",
    "mode": " mode ",
    "standard deviation": " standard_deviation ",
    "variance": " variance ",
    # --- geometry measures ---
    "perimeter": " perimeter ",
    "area": " area ",
    "volume": " volume ",
    "circumference": " circumference ",
    "radius": " radius ",
    "diameter": " diameter ",
    # --- ratios and aggregate words ---
    "ratio of": " ratio ",
    "ratio": " ratio ",
    "proportion": " proportion ",
    "sum of": " sum ",
    "difference between": " difference ",
    "product of": " product ",
    "quotient of": " quotient ",
}
178
+
179
+
180
# Single-character vulgar fractions -> ASCII "numerator/denominator".
# Keys are \uXXXX escapes so the table cannot be corrupted by a wrong source
# encoding (a mis-decoded key would be a multi-character string that never
# matches real input).
UNICODE_FRACTIONS: Dict[str, str] = {
    "\u00bd": "1/2",  # VULGAR FRACTION ONE HALF
    "\u2153": "1/3",  # VULGAR FRACTION ONE THIRD
    "\u2154": "2/3",  # VULGAR FRACTION TWO THIRDS
    "\u00bc": "1/4",  # VULGAR FRACTION ONE QUARTER
    "\u00be": "3/4",  # VULGAR FRACTION THREE QUARTERS
    "\u2155": "1/5",  # VULGAR FRACTION ONE FIFTH
    "\u2156": "2/5",  # VULGAR FRACTION TWO FIFTHS
    "\u2157": "3/5",  # VULGAR FRACTION THREE FIFTHS
    "\u2158": "4/5",  # VULGAR FRACTION FOUR FIFTHS
    "\u2159": "1/6",  # VULGAR FRACTION ONE SIXTH
    "\u215a": "5/6",  # VULGAR FRACTION FIVE SIXTHS
    "\u2150": "1/7",  # VULGAR FRACTION ONE SEVENTH
    "\u215b": "1/8",  # VULGAR FRACTION ONE EIGHTH
    "\u215c": "3/8",  # VULGAR FRACTION THREE EIGHTHS
    "\u215d": "5/8",  # VULGAR FRACTION FIVE EIGHTHS
    "\u215e": "7/8",  # VULGAR FRACTION SEVEN EIGHTHS
    "\u2151": "1/9",  # VULGAR FRACTION ONE NINTH
    "\u2152": "1/10",  # VULGAR FRACTION ONE TENTH
}
200
+
201
+
202
# Superscript digits/signs -> their plain ASCII counterparts.
# Keys are \uXXXX escapes so a wrong source encoding cannot turn them into
# multi-character strings that never match a single input character.
SUPERSCRIPT_MAP: Dict[str, str] = {
    "\u2070": "0",  # SUPERSCRIPT ZERO
    "\u00b9": "1",  # SUPERSCRIPT ONE
    "\u00b2": "2",  # SUPERSCRIPT TWO
    "\u00b3": "3",  # SUPERSCRIPT THREE
    "\u2074": "4",  # SUPERSCRIPT FOUR
    "\u2075": "5",  # SUPERSCRIPT FIVE
    "\u2076": "6",  # SUPERSCRIPT SIX
    "\u2077": "7",  # SUPERSCRIPT SEVEN
    "\u2078": "8",  # SUPERSCRIPT EIGHT
    "\u2079": "9",  # SUPERSCRIPT NINE
    "\u207a": "+",  # SUPERSCRIPT PLUS SIGN
    "\u207b": "-",  # SUPERSCRIPT MINUS
}
216
+
217
# Subscript digits/signs -> their plain ASCII counterparts.
# Keys are \uXXXX escapes so a wrong source encoding cannot turn them into
# multi-character strings that never match a single input character.
SUBSCRIPT_MAP: Dict[str, str] = {
    "\u2080": "0",  # SUBSCRIPT ZERO
    "\u2081": "1",  # SUBSCRIPT ONE
    "\u2082": "2",  # SUBSCRIPT TWO
    "\u2083": "3",  # SUBSCRIPT THREE
    "\u2084": "4",  # SUBSCRIPT FOUR
    "\u2085": "5",  # SUBSCRIPT FIVE
    "\u2086": "6",  # SUBSCRIPT SIX
    "\u2087": "7",  # SUBSCRIPT SEVEN
    "\u2088": "8",  # SUBSCRIPT EIGHT
    "\u2089": "9",  # SUBSCRIPT NINE
    "\u208a": "+",  # SUBSCRIPT PLUS SIGN
    "\u208b": "-",  # SUBSCRIPT MINUS
}
231
+
232
+
233
def _replace_unicode_fractions(text: str) -> str:
    """Expand the fraction glyphs listed in ``UNICODE_FRACTIONS`` to ASCII.

    A glyph that directly follows an integer is treated as a mixed number and
    rewritten as ``(whole+num/den)``. Plain substitution would merge the
    digits and change the value (a "2" followed by the one-half glyph would
    read as "21/2").

    Args:
        text: Input text, possibly containing fraction glyphs.

    Returns:
        The text with every known fraction glyph expanded.
    """
    if not text or not UNICODE_FRACTIONS:
        return text

    # Alternation (not a character class) so multi-character keys also work.
    glyphs = "|".join(
        re.escape(glyph)
        for glyph in sorted(UNICODE_FRACTIONS, key=len, reverse=True)
    )

    def _expand(match: "re.Match[str]") -> str:
        whole = match.group(1)
        fraction = UNICODE_FRACTIONS[match.group(2)]
        if whole is None:
            return fraction
        return f"({whole}+{fraction})"

    # The lookbehind anchors the optional integer at the start of its digit
    # run, so the whole integer part is captured and long digit strings are
    # not rescanned from every position.
    return re.sub(rf"(?:(?<!\d)(\d+))?({glyphs})", _expand, text)
237
+
238
+
239
def _replace_superscripts_and_subscripts(text: str) -> str:
    """Rewrite runs of superscript/subscript characters in ASCII form.

    Each maximal run of characters found in ``SUPERSCRIPT_MAP`` becomes ``^``
    followed by the mapped characters; each maximal run found in
    ``SUBSCRIPT_MAP`` becomes ``_`` followed by the mapped characters. All
    other characters are copied through unchanged.
    """
    pieces = []
    active = ""  # marker of the run currently being emitted: "^", "_" or ""
    for ch in text:
        if ch in SUPERSCRIPT_MAP:
            marker, mapped = "^", SUPERSCRIPT_MAP[ch]
        elif ch in SUBSCRIPT_MAP:
            marker, mapped = "_", SUBSCRIPT_MAP[ch]
        else:
            marker, mapped = "", ch

        # Emit the marker once, at the first character of each run.
        if marker and marker != active:
            pieces.append(marker)
        pieces.append(mapped)
        active = marker

    return "".join(pieces)
265
+
266
+
267
# ASCII operators that must survive as a unit. Replacing "<" and ">" one
# character at a time would split "<=" into "< =" (and likewise for the others).
_ASCII_COMPOUND_OPERATORS = {
    "<=": " <= ",
    ">=": " >= ",
    "<<": " << ",
    ">>": " >> ",
}


def _replace_symbol_chars(text: str) -> str:
    """Replace every symbol listed in ``SYMBOL_REPLACEMENTS`` with its token.

    The substitution is a single left-to-right pass that prefers the longest
    key at each position, so multi-character operators (``:=``, ``<=``,
    ``>=``, ``<<``, ``>>``) are matched as a whole before their individual
    characters, and replacement output is never re-scanned.

    Args:
        text: Input text.

    Returns:
        The text with symbols replaced; surrounding whitespace may be doubled
        and is expected to be collapsed by the caller.
    """
    if not text:
        return text

    # Entries from SYMBOL_REPLACEMENTS win if a key is defined in both tables.
    table = {**_ASCII_COMPOUND_OPERATORS, **SYMBOL_REPLACEMENTS}
    pattern = "|".join(
        re.escape(symbol) for symbol in sorted(table, key=len, reverse=True)
    )
    # A callable replacement keeps the tokens literal (no backslash handling).
    return re.sub(pattern, lambda match: table[match.group(0)], text)
271
+
272
+
273
def _replace_text_phrases(text: str) -> str:
    """Substitute the verbal phrases in ``TEXT_REPLACEMENTS``, ignoring case.

    Phrases are applied one after another from longest to shortest, each as a
    whole-word match, so "greater than or equal to" is consumed before the
    shorter "greater than" gets a chance to match inside it.
    """
    phrases_longest_first = sorted(TEXT_REPLACEMENTS, key=len, reverse=True)
    for phrase in phrases_longest_first:
        whole_phrase = re.compile(r"\b" + re.escape(phrase) + r"\b", re.IGNORECASE)
        text = whole_phrase.sub(TEXT_REPLACEMENTS[phrase], text)
    return text
278
+
279
+
280
+ def _normalize_roots(text: str) -> str:
281
+ # "sqrt 9" -> "sqrt(9)"
282
+ text = re.sub(r"\bsqrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"sqrt(\1)", text, flags=re.I)
283
+ text = re.sub(r"\bcbrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"cbrt(\1)", text, flags=re.I)
284
+ return text
285
+
286
+
287
+ def _normalize_percent_expressions(text: str) -> str:
288
+ # "25 percent of 80" -> "(25/100) * 80"
289
+ text = re.sub(
290
+ r"(\d+(?:\.\d+)?)\s*percent\s+of\s+(\d+(?:\.\d+)?)",
291
+ r"(\1/100) * \2",
292
+ text,
293
+ flags=re.I,
294
+ )
295
+
296
+ # "x percent" -> "(x/100)"
297
+ text = re.sub(
298
+ r"(\d+(?:\.\d+)?)\s*percent\b",
299
+ r"(\1/100)",
300
+ text,
301
+ flags=re.I,
302
+ )
303
+
304
+ # per-mille
305
+ text = re.sub(
306
+ r"(\d+(?:\.\d+)?)\s*permille\b",
307
+ r"(\1/1000)",
308
+ text,
309
+ flags=re.I,
310
+ )
311
+ return text
312
+
313
+
314
+ def _normalize_multiplication_spacing(text: str) -> str:
315
+ # 5x -> 5*x
316
+ text = re.sub(r"(\d)([a-zA-Z])", r"\1*\2", text)
317
+ # )x -> )*x
318
+ text = re.sub(r"(\))([a-zA-Z0-9])", r"\1*\2", text)
319
+ # x( -> x*(
320
+ text = re.sub(r"([a-zA-Z0-9])(\()", r"\1*\2", text)
321
+ return text
322
+
323
+
324
def _nfkc_preserving(text: str, protected) -> str:
    """Apply NFKC to ``text`` but copy characters in ``protected`` verbatim."""
    pieces = []
    pending = []  # unprotected characters waiting to be normalized together
    for ch in text:
        if ch in protected:
            if pending:
                pieces.append(unicodedata.normalize("NFKC", "".join(pending)))
                pending = []
            pieces.append(ch)
        else:
            pending.append(ch)
    if pending:
        pieces.append(unicodedata.normalize("NFKC", "".join(pending)))
    return "".join(pieces)


def normalize_math_text(text: str) -> str:
    """Normalize free-form math text into a plain, ASCII-oriented form.

    Steps, in order: compatibility normalization (NFKC), fraction glyphs,
    superscripts/subscripts, symbol replacement, verbal phrases, root call
    syntax, percent expressions, explicit multiplication, and whitespace
    collapsing.

    Args:
        text: Raw input; ``None`` or an empty string yields ``""``.

    Returns:
        The normalized text with single spaces and no leading/trailing space.
    """
    if not text:
        return ""

    # NFKC folds compatibility characters into plain ones: a superscript or
    # subscript digit becomes an ordinary digit, a fraction glyph becomes
    # digit + FRACTION SLASH + digit, and DOUBLE PRIME becomes two PRIMEs.
    # Normalizing those first would erase exactly the information the
    # replacement tables key on, so every non-ASCII single-character key
    # handled by a table is shielded from NFKC.
    protected = {
        key
        for table in (
            UNICODE_FRACTIONS,
            SUPERSCRIPT_MAP,
            SUBSCRIPT_MAP,
            SYMBOL_REPLACEMENTS,
        )
        for key in table
        if len(key) == 1 and not key.isascii()
    }
    text = _nfkc_preserving(text, protected)

    text = _replace_unicode_fractions(text)
    text = _replace_superscripts_and_subscripts(text)
    text = _replace_symbol_chars(text)
    text = _replace_text_phrases(text)
    text = _normalize_roots(text)
    text = _normalize_percent_expressions(text)
    text = _normalize_multiplication_spacing(text)

    # normalize repeated spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text
341
+
342
+
343
def normalize_for_solver(text: str) -> str:
    """Normalize ``text`` and apply solver-oriented aliases.

    After ``normalize_math_text``, the standalone token ``pi`` is replaced by
    its numeric value and the standalone token ``approx`` by ``~``. Both are
    matched as whole words, so words that merely contain those letters
    ("spin", "approximately") are left intact.
    """
    text = normalize_math_text(text)

    # make some solver-oriented aliases
    text = re.sub(r"\bpi\b", "3.141592653589793", text)
    text = re.sub(r"\bapprox\b", "~", text)

    return text
351
+
352
+
353
def normalize_for_parser(text: str) -> str:
    """Return ``normalize_math_text(text)`` unchanged.

    No aliases are applied here, so the semantic tokens produced by the
    normalizer stay available to the router/parser.
    """
    # keep semantic tokens for router/parser
    return normalize_math_text(text)