import os
import sys

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# CSV dataset of labelled headlines, resolved relative to the working directory.
DATA_FILE = "clickbait_data.csv"

# Columns read from the dataset: the headline text and its clickbait label.
_REQUIRED_COLUMNS = ("headline", "clickbait")


def _load_dataset(data_file):
    """Read the headline dataset and return only the rows usable for training.

    Prints a diagnostic and terminates the process with exit status 1 when
    the file is missing, cannot be read, lacks a required column, or has no
    complete rows.
    """
    if not os.path.exists(data_file):
        print(f"Error: '{data_file}' not found in the current directory.")
        print("Please make sure the dataset is present before running the training.")
        print("You can download it from Kaggle: https://www.kaggle.com/datasets/amananandrai/clickbait-dataset")
        sys.exit(1)

    try:
        df = pd.read_csv(data_file)
    except Exception as e:
        # Script-level boundary: any read/parse failure is reported, then fatal.
        print(f"Error reading {data_file}: {e}")
        sys.exit(1)

    print(f"Dataset loaded: {len(df)} headlines.")

    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        print(f"Error: {data_file} is missing required column(s): {', '.join(missing)}")
        sys.exit(1)

    # The vectorizer rejects missing (NaN) documents, so incomplete rows are
    # dropped up front instead of failing in the middle of training.
    clean = df.dropna(subset=list(_REQUIRED_COLUMNS))
    dropped = len(df) - len(clean)
    if dropped:
        print(f"Skipping {dropped} rows with a missing headline or label.")
    if clean.empty:
        print(f"Error: {data_file} contains no usable rows.")
        sys.exit(1)
    return clean


def train_model(data_file=None, model_file="clickbait_model.pkl"):
    """Train the clickbait classifier and save it to disk.

    Loads the labelled headlines, holds out 20% of them for evaluation, fits
    a TF-IDF + logistic-regression pipeline on the rest, prints the accuracy
    on the held-out rows and serialises the fitted pipeline with joblib.

    Args:
        data_file: Path of the CSV dataset. Defaults to ``DATA_FILE``.
        model_file: Path the fitted pipeline is written to.

    Raises:
        SystemExit: With status 1 when the dataset cannot be loaded.
    """
    print("Starting model training...")

    if data_file is None:
        # Looked up at call time so overriding the module constant still works.
        data_file = DATA_FILE
    df = _load_dataset(data_file)

    X = df['headline']
    y = df['clickbait']

    # Fixed seed keeps the train/test partition reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model_pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(max_features=5000)),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])

    print("Training the model... (This may take a minute)")
    model_pipeline.fit(X_train, y_train)

    accuracy = model_pipeline.score(X_test, y_test)
    print(f"Training complete. Model accuracy: {accuracy * 100:.2f}%")

    joblib.dump(model_pipeline, model_file)
    print(f"Model saved successfully as '{model_file}'")
# Run the training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    train_model()