| |
| import json |
| import re |
| import unicodedata |
|
|
def unicode_to_ascii(text):
    """Transliterate *text* to plain ASCII.

    NFKD normalization decomposes accented characters into a base
    character followed by combining marks; encoding with errors='ignore'
    then drops the combining marks and any other character that has no
    ASCII representation.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')
|
|
def clean_html_tags(html_string):
    """
    Remove all HTML tags from the input string and transliterate the
    remaining text to plain ASCII.

    Args:
        html_string (str): String containing HTML tags

    Returns:
        str: String with all HTML tags removed and all non-ASCII
            characters stripped (via unicode_to_ascii)
    """
    # Matches '<', one or more non-'>' characters, then '>'.  This is a
    # pragmatic tag stripper, not an HTML parser: a literal '<' or '>'
    # inside text content can confuse it.
    pattern = re.compile(r'<[^>]+>')

    # Call sub() on the compiled pattern directly; re.sub(pattern, ...)
    # would just re-dispatch through the module-level wrapper.
    clean_text = pattern.sub('', html_string)
    return unicode_to_ascii(clean_text)
|
|
# Convert the Megillah English-translation map into a JSONL file of
# per-sugya records ({"id", "metadata", "content"}), one per line.
#
# NOTE(review): the input is read line-by-line with json.loads per line,
# so it is expected to be JSON Lines despite the .json suffix -- confirm
# against however Megillah_map_to_english.json is produced.
with open("Megillah_map_to_english.json", "r", encoding="utf-8") as file:
    megillah_data = file.readlines()


with open("megillah_sugyot.json", "w", encoding="utf-8") as output_file:

    for line in megillah_data:
        full_talmud = json.loads(line)

        # Each parsed object maps a sugya identifier to its list of
        # sections; every section carries 'english' (HTML-laden text)
        # and 'sefaria_id'.
        for sugya, texts in full_talmud.items():
            metadata = {"sugya": sugya, "sections": []}
            # Collect the cleaned sections and join once (linear) instead
            # of repeated string += (quadratic).  Each piece keeps its
            # trailing space so the output matches the original exactly.
            parts = []
            for text in texts:
                parts.append(f"{clean_html_tags(text['english'])} ")
                metadata["sections"].append(text['sefaria_id'])
            content = "".join(parts)
            output = {"id": sugya, "metadata": metadata, "content": content}
            output_file.write(f"{json.dumps(output)}\n")
|
|
|
|