Spaces:
Runtime error
Runtime error
import os
import re

from dotenv import load_dotenv

# Pull settings from a .env file (when present) into the process environment
# before any of them are read below.
load_dotenv()

# Qdrant connection settings; each is None when the variable is not set.
# NOTE(review): neither value is referenced in the visible part of this file.
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# All data paths are resolved against the current working directory, so the
# script must be launched from the project root.
BASE_PATH = os.getcwd()
INPUT_FOLDER = os.path.join(BASE_PATH, "data", "processed")
OUTPUT_FOLDER = os.path.join(BASE_PATH, "data", "clean")

# Create the output folder at import time; a no-op when it already exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
def clean_text(text: str) -> str:
    """Return *text* with symbol noise removed and whitespace normalised.

    Steps, in order:

    1. Drop code points that cannot be encoded as UTF-8 (lone surrogates).
    2. Delete every run of characters that is not a word character,
       whitespace, or one of ``. , ? ! - – — /``.
    3. Collapse every run of whitespace -- including newlines and tabs --
       into a single space.
    4. Strip leading and trailing whitespace.

    The result is therefore always a single line with no consecutive spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, single-line string (possibly empty).
    """
    # Round-tripping through UTF-8 with "ignore" discards unencodable code
    # points such as lone surrogates; ordinary Unicode text is unaffected.
    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")

    # Strip symbols BEFORE collapsing whitespace: deleting a symbol that sits
    # between two spaces would otherwise leave a double space behind.
    text = re.sub(r"[^\w\s.,?!\-–—/]+", "", text)

    # \s also matches newlines, so line breaks are flattened to spaces here;
    # no separate newline-collapsing pass is needed.
    text = re.sub(r"\s+", " ", text)

    return text.strip()
def clean_all_files():
    """Clean every ``.txt`` file in INPUT_FOLDER and save it to OUTPUT_FOLDER.

    Each matching file is read as UTF-8 (undecodable bytes are ignored),
    passed through ``clean_text``, and written under the same file name in
    OUTPUT_FOLDER, overwriting any existing file. Progress is reported on
    standard output. Entries that do not end in ``.txt`` are skipped.
    """
    print("Cleaning text files...")
    print("Input:", INPUT_FOLDER)
    print("Output:", OUTPUT_FOLDER)

    for name in os.listdir(INPUT_FOLDER):
        # Only plain-text files are processed.
        if not name.endswith(".txt"):
            continue

        source_path = os.path.join(INPUT_FOLDER, name)
        target_path = os.path.join(OUTPUT_FOLDER, name)

        with open(source_path, "r", encoding="utf-8", errors="ignore") as src:
            cleaned = clean_text(src.read())

        with open(target_path, "w", encoding="utf-8") as dst:
            dst.write(cleaned)

        print("Cleaned:", name)

    print("\n✨ Done! Text cleaned successfully.")
# Script entry point: run the batch clean only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    clean_all_files()
    # NOTE(review): the original indentation was lost; this print is assumed
    # to belong inside the guard -- confirm against the real file.
    print("done")