import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import pickle
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gradio as gr
import os
import nltk
| nltk.download("stopwords", quiet=True)
|
| nltk.download("wordnet", quiet=True)
|
|
|
|
|
| stop_words = set(stopwords.words("english"))
|
| lemmatizer = WordNetLemmatizer()
|
|
|
|
|
| device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
class AmazonReviewDataset(Dataset):
    def __init__(self, csv_file, max_length=50, sample_fraction=0.01, max_vocab_size=5000,
                 vocab=None, label_encoder=None):
        print("Loading dataset from:", csv_file)
        self.data = pd.read_csv(csv_file, header=None, names=["label", "title", "text"])
        self.data = self.data.sample(frac=sample_fraction, random_state=42).reset_index(drop=True)
        print(f"Using {len(self.data)} samples ({sample_fraction * 100:.2f}% of the dataset).")

        # Clean both columns so the vocabulary and the per-sample tokens are
        # built from the same normalized text; astype(str) guards against
        # missing values in the CSV.
        self.data["title"] = self.data["title"].astype(str).apply(self.clean_text)
        self.data["text"] = self.data["text"].astype(str).apply(self.clean_text)

        self.max_length = max_length

        if vocab is not None:
            # Reuse an existing vocabulary (e.g. the training vocabulary when
            # loading the test split) so token ids stay consistent.
            self.vocab = vocab
        else:
            self.vocab = {"<PAD>": 0, "<UNK>": 1}
            print("Building vocabulary...")
            self._build_vocab(max_vocab_size)
            print("Vocabulary built successfully.")

        if label_encoder is not None:
            self.label_encoder = label_encoder
        else:
            self.label_encoder = LabelEncoder()
            self.label_encoder.fit(self.data["label"])

    def clean_text(self, text):
        # Keep letters and whitespace only, lowercase, drop stopwords and
        # lemmatize the remaining tokens.
        text = re.sub(r"[^a-zA-Z\s]", "", text)
        text = text.lower()
        text = " ".join([word for word in text.split() if word not in stop_words])
        text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
        return text

    def _build_vocab(self, max_vocab_size):
        all_text = self.data["title"].astype(str) + " " + self.data["text"].astype(str)
        all_text = all_text.fillna("")
        all_text = all_text[:50000]

        # Count token frequencies in chunks to keep memory usage bounded.
        token_counts = Counter()
        chunk_size = 5000
        for i in range(0, len(all_text), chunk_size):
            chunk = all_text[i:i + chunk_size]
            tokens = " ".join(chunk).split()
            token_counts.update(tokens)
            print(f"Processed {min(i + chunk_size, len(all_text))} rows...")

        # Keep only the most frequent tokens; everything else maps to <UNK>.
        most_common_tokens = [token for token, _ in token_counts.most_common(max_vocab_size)]
        for token in most_common_tokens:
            self.vocab[token] = len(self.vocab)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label = self.data.iloc[idx]["label"]
        title = str(self.data.iloc[idx]["title"])
        text = str(self.data.iloc[idx]["text"])
        combined_text = title + " " + text
        tokens = combined_text.split()[:self.max_length]
        token_ids = [self.vocab.get(token, self.vocab["<UNK>"]) for token in tokens]
        padding = [self.vocab["<PAD>"]] * (self.max_length - len(token_ids))
        token_ids += padding
        label_encoded = self.label_encoder.transform([label])[0]
        # Return CPU tensors; batches are moved to `device` in the training
        # and evaluation loops, which keeps multi-worker loading safe.
        return torch.tensor(token_ids, dtype=torch.long), torch.tensor(label_encoded, dtype=torch.long)
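

# Illustrative use of AmazonReviewDataset (hypothetical file name, not part of
# the pipeline): each item is a fixed-length vector of max_length token ids
# plus an encoded label.
#
#   ds = AmazonReviewDataset("reviews.csv", sample_fraction=0.001)
#   ids, label = ds[0]
#   ids.shape  # torch.Size([50])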


class PolicyNetwork(nn.Module):
    def __init__(self, vocab_size, embed_dim=32, hidden_dim=128, num_classes=2):
        super(PolicyNetwork, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # The LSTM is bidirectional, so the classifier sees hidden_dim * 2 features.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Classify from the output at the last time step; returns raw logits.
        out = self.fc(lstm_out[:, -1, :])
        return out
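

# train_rl_model treats classification as a one-step RL problem and trains the
# network with a REINFORCE-style policy gradient: a class is sampled from the
# softmax output as the "action", a reward of +1/-1 is given for a correct or
# incorrect prediction, rewards are normalized within the batch as a simple
# baseline, and an entropy term weighted by entropy_weight regularizes the
# policy.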


def train_rl_model(dataset, policy_net, optimizer, num_episodes=3, entropy_weight=0.01, lr=0.001, batch_size=16):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    for episode in range(num_episodes):
        print(f"Episode {episode + 1} started.")
        total_reward = 0
        for batch in dataloader:
            tokenized_reviews, true_labels = batch
            tokenized_reviews = tokenized_reviews.to(device)
            true_labels = true_labels.to(device)
            logits = policy_net(tokenized_reviews)
            probs = torch.softmax(logits, dim=-1)
            # Sample one action (predicted class) per review; squeeze(-1) keeps
            # a 1-D batch even when the last batch holds a single sample.
            actions = torch.multinomial(probs, 1).squeeze(-1)

            # Reward +1 for a correct prediction and -1 otherwise, then
            # normalize within the batch as a simple variance-reducing baseline.
            rewards = [1 if action == label else -1 for action, label in zip(actions, true_labels)]
            rewards_tensor = torch.tensor(rewards, dtype=torch.float32).to(device)
            rewards_tensor = (rewards_tensor - rewards_tensor.mean()) / (rewards_tensor.std() + 1e-8)

            # REINFORCE loss with an entropy bonus (subtracted, so higher
            # entropy lowers the loss and keeps the policy exploratory).
            loss = 0
            entropy_loss = 0
            for i, action in enumerate(actions):
                log_prob = torch.log(probs[i, action] + 1e-8)
                loss += -log_prob * rewards_tensor[i]
                entropy_loss += -(probs[i] * torch.log(probs[i] + 1e-8)).sum()

            loss -= entropy_weight * entropy_loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)
            optimizer.step()

            total_reward += sum(rewards)

        print(f"Episode {episode + 1}, Total Reward: {total_reward}, Loss: {loss.item()}")

    # Save the trained weights so predict_review can reload them later.
    torch.save(policy_net.state_dict(), "policy_net.pkl")
    print("Model saved successfully as policy_net.pkl")
def evaluate_model(dataset, policy_net):
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=4)
    correct = 0
    total = 0
    policy_net.eval()
    with torch.no_grad():
        for batch in dataloader:
            tokenized_reviews, true_labels = batch
            tokenized_reviews = tokenized_reviews.to(device)
            true_labels = true_labels.to(device)
            logits = policy_net(tokenized_reviews)
            probs = torch.softmax(logits, dim=-1)
            predicted_classes = torch.argmax(probs, dim=-1)
            correct += (predicted_classes == true_labels).sum().item()
            total += true_labels.size(0)
    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100:.2f}%")
    return accuracy


def predict_review(review_text):
    # Reload the artifacts written at the end of training.
    with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    with open("label_encoder.pkl", "rb") as f:
        label_encoder = pickle.load(f)

    # Apply the same cleaning used for the training data so the tokens can
    # actually match the vocabulary (letters only, lowercase, stopwords
    # removed, lemmatized).
    review_text = re.sub(r"[^a-zA-Z\s]", "", review_text).lower()
    review_text = " ".join(word for word in review_text.split() if word not in stop_words)
    review_text = " ".join(lemmatizer.lemmatize(word) for word in review_text.split())

    tokenized_input = review_text.split()[:50]
    token_ids = [vocab.get(word, vocab["<UNK>"]) for word in tokenized_input]
    padding = [vocab["<PAD>"]] * (50 - len(token_ids))
    token_ids += padding
    token_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)

    policy_net = PolicyNetwork(len(vocab), embed_dim=32, hidden_dim=128, num_classes=2).to(device)
    policy_net.load_state_dict(torch.load("policy_net.pkl", map_location=device))
    policy_net.eval()

    with torch.no_grad():
        logits = policy_net(token_ids)
        probs = torch.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=-1).item()
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]
    return predicted_label
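

# Note: predict_review returns the raw label value from the CSV rather than a
# "Positive"/"Negative" string. If the files are the standard Amazon Reviews
# Polarity CSVs (an assumption), label 1 corresponds to negative reviews and
# label 2 to positive ones.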


if __name__ == "__main__":
    train_csv_path = r"D:\b\train.csv"
    test_csv_path = r"D:\b\test.csv"
    sample_fraction = 0.01
    max_vocab_size = 5000
    num_episodes = 3
    batch_size = 16
    lr = 0.001
    entropy_weight = 0.01

    train_dataset = AmazonReviewDataset(train_csv_path, sample_fraction=sample_fraction, max_vocab_size=max_vocab_size)
    # Reuse the training vocabulary and label encoder for the test split so
    # token ids and label encodings match the trained model.
    test_dataset = AmazonReviewDataset(test_csv_path, sample_fraction=sample_fraction, max_vocab_size=max_vocab_size,
                                       vocab=train_dataset.vocab, label_encoder=train_dataset.label_encoder)
    print("Dataset loaded successfully.")

    policy_net = PolicyNetwork(len(train_dataset.vocab), embed_dim=32, hidden_dim=128, num_classes=2).to(device)
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    train_rl_model(train_dataset, policy_net, optimizer, num_episodes=num_episodes, entropy_weight=entropy_weight, lr=lr, batch_size=batch_size)

    evaluate_model(test_dataset, policy_net)

    with open("vocab.pkl", "wb") as f:
        pickle.dump(train_dataset.vocab, f)
    with open("label_encoder.pkl", "wb") as f:
        pickle.dump(train_dataset.label_encoder, f)
    print("Vocabulary and label encoder saved successfully.")

    iface = gr.Interface(
        fn=predict_review,
        inputs="text",
        outputs="text",
        title="Amazon Review Sentiment Analysis",
        description="Enter a review to predict its sentiment (Positive/Negative).",
    )

    iface.launch(share=True)