UniversityAI / chunk_text.py
Alsmwal's picture
Upload 28 files
18ad9a9 verified
raw
history blame
2.17 kB
# scr/chunk_text.py
from dotenv import load_dotenv
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Load environment variables through python-dotenv before reading them below.
load_dotenv()

# Qdrant connection settings; each is None when the variable is not set.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

# Paths, all resolved against the current working directory.
BASE_PATH = os.getcwd()
CLEAN_FOLDER = os.path.join(BASE_PATH, "data", "clean")
CHUNK_FOLDER = os.path.join(BASE_PATH, "data", "chunks")

# Create the chunks folder if it does not exist yet (runs at import time).
os.makedirs(CHUNK_FOLDER, exist_ok=True)

# Chunking settings: sizes are measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
)
# New function: receives the text directly.
def chunk_text(text):
    """
    Split a single text into chunks.

    Args:
        text (str): The text to split.

    Returns:
        list: The chunks produced by the module-level ``text_splitter``,
        or an empty list when ``text`` is empty, None, or whitespace only.
    """
    # Guard clause: nothing to split, so the splitter is never called.
    if not text or not text.strip():
        return []

    return text_splitter.split_text(text)
# Old function: kept for compatibility with older code.
def chunk_all_clean_files():
    """
    Chunk every ``.txt`` file in the clean folder.

    Each file in ``CLEAN_FOLDER`` whose name ends with ``.txt`` is read as
    UTF-8, split with the module-level ``text_splitter``, and written to
    ``CHUNK_FOLDER`` under the same file name, with every chunk followed by
    a ``---CHUNK---`` separator line. Progress is printed to stdout.

    (Legacy function, kept for backward compatibility.)
    """
    print("📌 Chunking files...\n")

    # os.listdir returns names in arbitrary order; sort for a stable run order.
    for filename in sorted(os.listdir(CLEAN_FOLDER)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(CLEAN_FOLDER, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        chunks = text_splitter.split_text(text)

        # Save the chunks, each one terminated by the separator line.
        output_path = os.path.join(CHUNK_FOLDER, filename)
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(chunk + "\n---CHUNK---\n" for chunk in chunks)

        # The file name and the chunk count were previously printed fused
        # together (e.g. "notes.txt12 chunks"); keep them visibly separate.
        print(f"✔ تم تقطيع الملف: {filename} → {len(chunks)} chunks")

    print("\n🎉 Done! All files chunked successfully.")
if __name__ == "__main__":
    # Script entry point: chunk every cleaned file, then report completion.
    chunk_all_clean_files()
    print("done")