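# UniversityAI / clean_text.py
# Source: Hugging Face file viewer. Uploaded by Alsmwal ("Upload 28 files"),
# commit 18ad9a9 (verified), file size 1.5 kB. The viewer's "raw" and
# "history blame" links are page navigation, not part of the program.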
from dotenv import load_dotenv
import os
# Load variables from a .env file (when one is found) into the process
# environment so the lookups below can see them.
load_dotenv()

# Qdrant settings read from the environment; each is None when its variable
# is unset. Presumably the vector-store endpoint and credential -- they are
# not referenced by the code visible in this file.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
import os
import re
# All paths are resolved against the directory the script is launched from
# (the current working directory), not the location of this file.
BASE_PATH = os.getcwd()
_DATA_DIR = os.path.join(BASE_PATH, "data")

# Where the raw .txt files are read from and where cleaned copies are written.
INPUT_FOLDER = os.path.join(_DATA_DIR, "processed")
OUTPUT_FOLDER = os.path.join(_DATA_DIR, "clean")

# Create the output directory up front (import-time side effect); an already
# existing directory is not an error.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
def clean_text(text):
    """Normalize raw text into a single, whitespace-collapsed line.

    Steps, in order:
      1. Drop code points that cannot be encoded as UTF-8 (e.g. lone
         surrogates left over from lossy decoding).
      2. Delete every character that is not a word character, whitespace,
         or one of ``. , ? ! - – — /``.
      3. Collapse each run of whitespace -- newlines and tabs included --
         into one space and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. It never contains newlines or consecutive
        spaces, and is empty when nothing survives the cleaning.
    """
    # Round-trip through UTF-8 with "ignore" so unencodable code points are
    # silently discarded instead of raising later when the text is written.
    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")

    # Strip symbols BEFORE collapsing whitespace: removing a symbol that sits
    # between two spaces ("a @ b") would otherwise leave a double space behind.
    text = re.sub(r"[^\w\s.,?!\-–—/]+", "", text)

    # \s+ matches newlines too, so the output is always one line; a separate
    # newline-collapsing pass would have nothing left to match.
    text = re.sub(r"\s+", " ", text)

    return text.strip()
def clean_all_files():
    """Clean every ``.txt`` file in INPUT_FOLDER and write it to OUTPUT_FOLDER.

    Each matching file is read as UTF-8 (undecodable bytes are ignored),
    passed through ``clean_text``, and saved under the same file name in
    OUTPUT_FOLDER, overwriting any existing file. Progress is reported on
    standard output. Entries whose names do not end in ``.txt`` are skipped.
    """
    print("Cleaning text files...")
    print("Input:", INPUT_FOLDER)
    print("Output:", OUTPUT_FOLDER)

    for name in os.listdir(INPUT_FOLDER):
        # Only plain-text inputs are processed; everything else is left alone.
        if not name.endswith(".txt"):
            continue

        source_path = os.path.join(INPUT_FOLDER, name)
        target_path = os.path.join(OUTPUT_FOLDER, name)

        with open(source_path, "r", encoding="utf-8", errors="ignore") as src:
            contents = src.read()

        # Clean before opening the destination so a failure in cleaning
        # cannot leave a truncated output file behind.
        result = clean_text(contents)

        with open(target_path, "w", encoding="utf-8") as dst:
            dst.write(result)

        print("Cleaned:", name)

    print("\n✨ Done! Text cleaned successfully.")
# Run the batch cleaning only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    clean_all_files()
    print("done")