"""250405_Eum_lstm.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1-ABY8VONWw9g6ESYhz0Cir0qcrT-OfIW
"""
|
|
| import torch |
| import os |
| from google.colab import files |
| import pandas as pd |
| from sklearn.model_selection import train_test_split |
| import re |
| import nltk |
| from nltk.tokenize import word_tokenize |
| from collections import Counter |
| from torch.utils.data import Dataset, DataLoader |
| import torch.nn as nn |
|
|
| |
|
|
# Pick the compute device once; every tensor/module below is moved onto it.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("์ฌ์ฉ ๊ฐ๋ฅํ device:", device)
# Force synchronous CUDA kernel launches so errors surface at the failing call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Interactively upload the dataset CSV into the Colab runtime.
uploaded = files.upload()
|
|
| |
|
|
| |
# Load the labelled message dataset (uploaded above).
data = pd.read_csv('lgaidataset (1).csv')

# Rows with a missing 'index' get fresh ids continuing past the current maximum.
max_index = data['index'].max()
missing = data['index'].isna()
nan_count = missing.sum()
start = int(max_index) + 1
new_values = range(start, start + nan_count)
data.loc[missing, 'index'] = new_values

# Shift labels up by one (later code subtracts 1 again to get 0-based targets).
data.loc[:, 'class'] = data['class'] + 1

# 80/20 train/test split, fixed seed for reproducibility.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
print("Train ๋ฐ์ดํฐ ํฌ๊ธฐ:", train_df.shape)
print("Test ๋ฐ์ดํฐ ํฌ๊ธฐ:", test_df.shape)
|
|
| |
|
|
# Fetch the Punkt sentence/word tokenizer data used by word_tokenize.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
|
|
def preprocess_text(text):
    """Lowercase *text*, drop non-alphanumeric characters, and tokenize.

    Keeps Hangul syllables, ASCII letters, digits, and whitespace, then splits
    the cleaned string into word tokens with NLTK's ``word_tokenize``.

    Args:
        text: Raw message string.

    Returns:
        list[str]: Word tokens.
    """
    text = text.lower()
    # Fix: the Hangul range in the character class was mojibake-corrupted,
    # so the filter no longer matched Korean syllables and would have stripped
    # all Korean text; restore the intended 가-힣 syllable range.
    text = re.sub(r'[^가-힣a-z0-9\s]', '', text)
    tokens = word_tokenize(text)
    return tokens
|
|
"""Sanity check: run preprocess_text on a sample message to confirm Korean text tokenizes correctly."""
|
|
| |
|
|
# Collect the full token stream of the training split.
all_tokens = []
for msg in train_df['content']:
    all_tokens.extend(preprocess_text(msg))

word_counts = Counter(all_tokens)

# Assign word indices from 2 upward; 0 and 1 are reserved for PAD/UNK.
# (count >= 1 applies no frequency cut-off — every seen word is kept.)
vocab = {}
for position, (word, count) in enumerate(word_counts.items()):
    if count >= 1:
        vocab[word] = position + 2
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
vocab_size = len(vocab) + 1  # +1 head-room for the embedding table size
print("๋จ์ด ์งํฉ ํฌ๊ธฐ:", vocab_size)
|
|
| |
def text_to_sequence(text, vocab, max_len=50):
    """Convert *text* into a fixed-length list of vocabulary indices.

    Tokens missing from *vocab* map to the <UNK> index. Sequences shorter
    than *max_len* are right-padded with <PAD>; longer ones are truncated.
    """
    unk = vocab["<UNK>"]
    indices = [vocab.get(token, unk) for token in preprocess_text(text)]
    shortfall = max_len - len(indices)
    if shortfall > 0:
        indices.extend([vocab["<PAD>"]] * shortfall)
        return indices
    return indices[:max_len]
|
|
| |
|
|
class SpamDataset(Dataset):
    """Torch dataset yielding (token-index sequence, 0-based label) pairs."""

    def __init__(self, df, vocab, max_len=50):
        # Materialize the columns once so __getitem__ does no DataFrame work.
        self.messages = df['content'].tolist()
        self.labels = df['class'].tolist()
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.messages)

    def __getitem__(self, idx):
        sequence = text_to_sequence(self.messages[idx], self.vocab, self.max_len)
        # Labels were shifted to be 1-based earlier; undo the shift so the
        # targets fed to CrossEntropyLoss start at 0.
        target = self.labels[idx] - 1
        return (torch.tensor(sequence, dtype=torch.long),
                torch.tensor(target, dtype=torch.long))
|
|
| |
batch_size = 64

# Wrap both splits; only the training loader shuffles between epochs.
train_dataset = SpamDataset(train_df, vocab)
test_dataset = SpamDataset(test_df, vocab)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
|
|
| |
class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> linear head over the final hidden state."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1):
        super(LSTMClassifier, self).__init__()
        # padding_idx=0 pins the <PAD> embedding to zeros.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (h_n, _) = self.lstm(embedded)
        # h_n[-1] is the top layer's final hidden state — identical to
        # out[:, -1, :] for a unidirectional batch_first LSTM.
        # NOTE(review): with right-padded inputs this is the state *after*
        # the <PAD> positions, not after the last real token; consider
        # pack_padded_sequence if accuracy matters — TODO confirm intent.
        return self.fc(h_n[-1])
|
|
| |
# Model hyper-parameters.
embed_dim = 100
hidden_dim = 128
output_dim = 5   # five target classes (0..4 after the label shift)
num_layers = 1

# Build the classifier and place it on the selected device.
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim, num_layers)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005)

num_epochs = 3
|
|
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for step, (inputs, labels) in enumerate(train_loader, start=1):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Standard step: clear grads, forward, loss, backward, update.
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Periodic progress report every 100 mini-batches.
        if step % 100 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Iteration {step}/{len(train_loader)}, Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}")
|
|
| |
# Evaluate accuracy on the held-out split; no gradients needed.
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        predictions = model(inputs).argmax(dim=1)
        total += labels.size(0)
        correct += (predictions == labels).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")
|
|
"""Added: classify a message typed in by the user."""
|
|
def predict_message(text):
    """Classify one raw message with the trained model.

    Preprocesses *text*, converts it to an index sequence, runs a forward
    pass, and returns the predicted class as the original 1-based label.
    """
    model.eval()
    sequence = text_to_sequence(text, vocab, max_len=50)
    # Add a batch dimension and move to the model's device.
    batch = torch.tensor(sequence, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(batch)

    # argmax yields the 0-based class; shift back to the 1-based label scheme.
    return logits.argmax(dim=1).item() + 1
|
|
| |
# Prompt the user for a message and print its predicted class.
sample_text = input("์์ธกํ ๋ฉ์์ง๋ฅผ ์๋ ฅํ์ธ์: ")
predicted_class = predict_message(sample_text)
print("์์ธก๋ ํด๋์ค:", predicted_class)
|
|
| |
| |
# Persist the trained weights to Google Drive, then download a local copy.
from google.colab import drive
drive.mount('/content/drive')

save_dir = '/content/drive/My Drive/'
model_name = 'Eum_lstm_save.pth'
model_path = os.path.join(save_dir, model_name)

# Save only the state dict (recommended over pickling the whole module).
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

files.download(model_path)