"""Clean the text files in ``data/processed`` and write them to ``data/clean``.

Run as a script to process every ``.txt`` file in ``INPUT_FOLDER``.
"""

import os
import re

from dotenv import load_dotenv

load_dotenv()

# Qdrant connection settings, read from the environment after load_dotenv().
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Folders are resolved relative to the directory the script is started from.
BASE_PATH = os.getcwd()
INPUT_FOLDER = os.path.join(BASE_PATH, "data", "processed")
OUTPUT_FOLDER = os.path.join(BASE_PATH, "data", "clean")

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Anything that is not a word character, whitespace, or basic punctuation.
_SYMBOL_RE = re.compile(r"[^\w\s.,?!\-–—/]+")
# Runs of whitespace that do not contain a newline (spaces, tabs, "\r", ...).
_HORIZONTAL_SPACE_RE = re.compile(r"[^\S\n]+")
# One or more newlines together with the spaces surrounding them.
_NEWLINE_RUN_RE = re.compile(r" *\n[ \n]*")


def clean_text(text):
    """Return *text* with stray symbols and redundant whitespace removed.

    Steps, in order:

    1. Drop characters that cannot be encoded as UTF-8.
    2. Delete every character that is not a word character, whitespace, or
       one of ``. , ? ! - – — /``.
    3. Collapse each run of horizontal whitespace to a single space.
    4. Collapse each run of newlines (blank lines included) to one newline.
    5. Strip leading and trailing whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; line breaks are preserved as single ``"\\n"``.
    """
    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")

    # Strip symbols before collapsing whitespace so a removed symbol
    # ("a * b") does not leave a double space behind.
    text = _SYMBOL_RE.sub("", text)

    # Only horizontal whitespace is collapsed here: "\s+" would also swallow
    # newlines and flatten the whole document onto one line.
    text = _HORIZONTAL_SPACE_RE.sub(" ", text)

    # Also removes spaces hugging a line break and lines left empty above.
    text = _NEWLINE_RUN_RE.sub("\n", text)

    return text.strip()


def clean_all_files():
    """Clean every ``.txt`` file in ``INPUT_FOLDER`` into ``OUTPUT_FOLDER``.

    Each file is read as UTF-8 (undecodable bytes are ignored), passed
    through :func:`clean_text`, and written under the same file name in
    ``OUTPUT_FOLDER``. Progress is reported with ``print``.

    Raises:
        FileNotFoundError: If ``INPUT_FOLDER`` does not exist.
    """
    print("Cleaning text files...")
    print("Input:", INPUT_FOLDER)
    print("Output:", OUTPUT_FOLDER)

    # os.listdir() returns names in arbitrary order; sort for stable output.
    for file in sorted(os.listdir(INPUT_FOLDER)):
        if not file.endswith(".txt"):
            continue

        in_path = os.path.join(INPUT_FOLDER, file)
        # A directory whose name ends in ".txt" cannot be opened as a file.
        if not os.path.isfile(in_path):
            continue
        out_path = os.path.join(OUTPUT_FOLDER, file)

        with open(in_path, "r", encoding="utf-8", errors="ignore") as f:
            raw = f.read()

        cleaned = clean_text(raw)

        with open(out_path, "w", encoding="utf-8") as f:
            f.write(cleaned)

        print("Cleaned:", file)

    print("\n✨ Done! Text cleaned successfully.")


if __name__ == "__main__":
    clean_all_files()
    print("done")