Spaces:
Runtime error
Runtime error
| # scr/chunk_text.py | |
| from dotenv import load_dotenv | |
| import os | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
# Load variables from a .env file (if any) into the process environment
# before the settings below are read.
load_dotenv()

# Qdrant connection settings; each is None when the variable is not set.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

# Paths, resolved relative to the current working directory.
BASE_PATH = os.getcwd()
CLEAN_FOLDER = os.path.join(BASE_PATH, "data", "clean")
CHUNK_FOLDER = os.path.join(BASE_PATH, "data", "chunks")

# Create the chunks folder if it does not exist yet.
os.makedirs(CHUNK_FOLDER, exist_ok=True)

# Chunking settings: chunks of up to 500 units as measured by len(),
# with a 50-unit overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=500,
    chunk_overlap=50,
)
| # ✅ الدالة الجديدة - تستقبل text مباشرة | |
def chunk_text(text):
    """Split a single piece of text into chunks.

    Args:
        text (str): The text to split.

    Returns:
        list: The chunks produced by the module-level ``text_splitter``,
            or an empty list when ``text`` is empty or only whitespace.
    """
    # Falsy input (None, "") short-circuits before .strip() is called.
    has_content = bool(text) and bool(text.strip())
    return text_splitter.split_text(text) if has_content else []
| # ✅ الدالة القديمة - للتوافق مع الكود القديم | |
def chunk_all_clean_files():
    """Chunk every ``.txt`` file in ``CLEAN_FOLDER`` into ``CHUNK_FOLDER``.

    Legacy batch entry point, kept for backward compatibility. Each input
    file is read as UTF-8, split with ``chunk_text``, and written to a file
    of the same name in ``CHUNK_FOLDER``; every chunk is followed by a
    ``---CHUNK---`` separator line. Progress is printed to stdout.

    Raises:
        OSError: If ``CLEAN_FOLDER`` cannot be listed (for example
            ``FileNotFoundError`` when it does not exist) or a file cannot
            be read or written.
    """
    print("📌 Chunking files...\n")

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order and the progress log reproducible between runs.
    for filename in sorted(os.listdir(CLEAN_FOLDER)):
        file_path = os.path.join(CLEAN_FOLDER, filename)

        # Only regular .txt files are processed; a directory whose name
        # ends in ".txt" would otherwise make open() fail.
        if not filename.endswith(".txt") or not os.path.isfile(file_path):
            continue

        with open(file_path, "r", encoding="utf-8") as src:
            text = src.read()

        # Go through the shared helper so empty or whitespace-only files
        # are handled the same way as in the single-text API.
        chunks = chunk_text(text)

        # Save the chunks, each followed by the separator line.
        output_path = os.path.join(CHUNK_FOLDER, filename)
        with open(output_path, "w", encoding="utf-8") as dst:
            dst.writelines(chunk + "\n---CHUNK---\n" for chunk in chunks)

        print(f"✔ تم تقطيع الملف: {filename} → {len(chunks)} chunks")

    print("\n🎉 Done! All files chunked successfully.")
# Script entry point: run the legacy batch chunking when executed directly.
if __name__ == "__main__":
    chunk_all_clean_files()
    # NOTE(review): indentation was lost in the source; this print is assumed
    # to belong to the __main__ guard rather than run on import — confirm.
    print("done")