"""Prepare model inputs for the language dataset.

Vectorizes ``df['clean_text']`` into token counts, label-encodes
``df['language']`` into a new ``df['language_encoded']`` column, and
one-hot encodes the pre-split training and validation targets.
"""

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

from data_analysis import df
from data_splitting import y_train, y_val

# Width of the one-hot target vectors produced below.
# NOTE(review): presumably the number of distinct values in df['language'];
# confirm it matches len(le.classes_).
NUM_CLASSES = 22

# Learn the vocabulary from the cleaned text and build the token-count matrix.
cv = CountVectorizer()
features = cv.fit_transform(df['clean_text'])

# Store counts as unsigned 8-bit integers (presumably to cut memory use).
# NOTE(review): a count above 255 does not fit in uint8 -- confirm no single
# document repeats a token that often.
features = features.astype('uint8')

# Map each language name to an integer id and keep it alongside the data.
le = LabelEncoder()
df['language_encoded'] = le.fit_transform(df['language'])

targets = df['language_encoded']

# One-hot encode the split targets.
# NOTE(review): assumes y_train / y_val already hold integer class ids in
# [0, NUM_CLASSES) -- verify against data_splitting.
y_train_encoded = tf.keras.utils.to_categorical(
    y_train, num_classes=NUM_CLASSES
)
y_val_encoded = tf.keras.utils.to_categorical(
    y_val, num_classes=NUM_CLASSES
)