| |
| import json |
| import re |
| import unicodedata |
|
|
def unicode_to_ascii(text):
    """Transliterate *text* to plain ASCII.

    NFKD normalization decomposes accented characters into a base
    character followed by combining marks; encoding with errors='ignore'
    then drops the combining marks and any other character that has no
    ASCII representation.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')
|
|
def clean_html_tags(html_string):
    """
    Remove all HTML tags from the input string and transliterate the
    remaining text to plain ASCII.

    Args:
        html_string (str): String containing HTML tags

    Returns:
        str: String with all HTML tags removed and all non-ASCII
            characters stripped (via unicode_to_ascii)
    """
    # Matches '<', one or more non-'>' characters, then '>'.  This is a
    # pragmatic tag stripper, not an HTML parser: a literal '<' or '>'
    # inside text content can confuse it.
    pattern = re.compile(r'<[^>]+>')

    # Call sub() on the compiled pattern directly; re.sub(pattern, ...)
    # would just re-dispatch through the module-level wrapper.
    clean_text = pattern.sub('', html_string)
    return unicode_to_ascii(clean_text)
|
|
# Convert the Megillah English-translation map into a JSONL file of
# per-sugya records ({"id", "metadata", "content"}), one per line.
#
# NOTE(review): the input is read line-by-line with json.loads per line,
# so it is expected to be JSON Lines despite the .json suffix -- confirm
# against however Megillah_map_to_english.json is produced.
with open("Megillah_map_to_english.json", "r", encoding="utf-8") as file:
    megillah_data = file.readlines()


with open("megillah_sugyot.json", "w", encoding="utf-8") as output_file:

    for line in megillah_data:
        full_talmud = json.loads(line)

        # Each parsed object maps a sugya identifier to its list of
        # sections; every section carries 'english' (HTML-laden text)
        # and 'sefaria_id'.
        for sugya, texts in full_talmud.items():
            metadata = {"sugya": sugya, "sections": []}
            # Collect the cleaned sections and join once (linear) instead
            # of repeated string += (quadratic).  Each piece keeps its
            # trailing space so the output matches the original exactly.
            parts = []
            for text in texts:
                parts.append(f"{clean_html_tags(text['english'])} ")
                metadata["sections"].append(text['sefaria_id'])
            content = "".join(parts)
            output = {"id": sugya, "metadata": metadata, "content": content}
            output_file.write(f"{json.dumps(output)}\n")
|
|
|
|