ABSA Models (Vietnamese - Aspect-Based Sentiment Analysis)
Phân tích cảm xúc theo khía cạnh sản phẩm trên reviews tiếng Việt.
Inference model mặc định: PhoBERT
So sánh hiệu suất
| Model | Type | Accuracy | Precision | Recall | F1 | Time(s) |
|---|---|---|---|---|---|---|
| PhoBERT | Transformer | 0.8854 | 0.6767 | 0.7489 | 0.7021 | 7647.6 |
| SVM | ML Classic | 0.8675 | 0.6507 | 0.5157 | 0.5407 | 17771.0 |
| LSTM | Deep Learning | 0.8639 | 0.6277 | 0.6528 | 0.6377 | 622.8 |
Cấu hình
- Aspects: SCREEN, CAMERA, FEATURES, BATTERY, PERFORMANCE, STORAGE, DESIGN, PRICE, GENERAL, SER&ACC
- Polarities: negative, neutral, positive, not_mentioned
- Max Length: 256
- Training: 15 epochs, lr=2e-05, batch_size=16
- Dataset: visolex/ViSFD
Hướng dẫn sử dụng
1. PhoBERT (Transformer)
import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import hf_hub_download
import py_vncorenlp
# Hub repo that hosts all three model variants (PhoBERT / SVM / LSTM).
REPO_ID = "MiruYandex/phobert-absa-vietnamese"
# The 10 product aspects, in the order of axis 1 of the model's logits.
ASPECT_COLUMNS = ['SCREEN', 'CAMERA', 'FEATURES', 'BATTERY', 'PERFORMANCE', 'STORAGE', 'DESIGN', 'PRICE', 'GENERAL', 'SER&ACC']
# Polarity classes, in the order of the logits' last axis; index 3 marks an
# aspect that the review does not mention.
POLARITY_LABELS = ['negative', 'neutral', 'positive', 'not_mentioned']
# 1. Load tokenizer + model weights
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, subfolder="phobert")
model_path = hf_hub_download(REPO_ID, filename="phobert/model.pt")
# 2. Reconstruct model
class PhoBERTForABSA(nn.Module):
    """PhoBERT encoder with additive-attention pooling and one small MLP
    polarity head per aspect.

    forward() returns {"logits": tensor of shape
    (batch, num_aspects, num_polarities)}.
    """

    def __init__(self, model_name="vinai/phobert-base", num_aspects=10, num_polarities=4, dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        dim = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        # Additive (Bahdanau-style) attention parameters.
        self.attn_W = nn.Linear(dim, dim, bias=False)
        self.attn_v = nn.Linear(dim, 1, bias=False)
        self.layer_norm = nn.LayerNorm(dim)
        # One independent classifier head per aspect.
        heads = []
        for _ in range(num_aspects):
            heads.append(
                nn.Sequential(
                    nn.Linear(dim, 256),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(256, num_polarities),
                )
            )
        self.aspect_classifiers = nn.ModuleList(heads)

    def forward(self, input_ids, attention_mask, **kwargs):
        """Encode the batch, pool with attention, and score every aspect."""
        token_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state  # (batch, seq_len, hidden)
        # Attention scores; padded positions get -inf so they receive zero
        # weight after softmax.
        scores = self.attn_v(torch.tanh(self.attn_W(token_states))).squeeze(-1)
        scores = scores.masked_fill(attention_mask == 0, float("-inf"))
        weights = torch.softmax(scores, dim=-1)
        # A fully-masked row softmaxes all-(-inf) to NaN; zero those out.
        # (x != x is True exactly for NaN.)
        weights = weights.masked_fill(weights != weights, 0.0)
        pooled = torch.bmm(weights.unsqueeze(1), token_states).squeeze(1)
        pooled = self.dropout(self.layer_norm(pooled))
        per_aspect = [head(pooled) for head in self.aspect_classifiers]
        return {"logits": torch.stack(per_aspect, dim=1)}
model = PhoBERTForABSA()
# weights_only=True restricts unpickling to tensors/primitives — safer for a
# checkpoint downloaded from a remote hub (available since torch 1.13, the
# default from torch 2.6).
model.load_state_dict(torch.load(model_path, map_location="cpu", weights_only=True))
model.eval()
# 3. Word segmentation (REQUIRED for PhoBERT)
rdrsegmenter = py_vncorenlp.VnCoreNLP(save_dir="/tmp/vncorenlp", annotators=["wseg"])
# 4. Predict
text = "Điện thoại đẹp, camera chụp rõ nét, pin trâu lắm"
segmented = " ".join(rdrsegmenter.word_segment(text))
inputs = tokenizer(segmented, return_tensors="pt", padding="max_length", truncation=True, max_length=256)
with torch.no_grad():
    logits = model(**inputs)["logits"]  # (1, 10, 4)
preds = logits.argmax(dim=-1)[0]  # (10,)
# Report only the aspects the review actually mentions (class 3 = not_mentioned).
for aspect, pred in zip(ASPECT_COLUMNS, preds):
    if pred.item() != 3:  # skip not_mentioned
        print(f" {aspect}: {POLARITY_LABELS[pred.item()]}")
2. SVM (TF-IDF + RBF Kernel)
import pickle
from huggingface_hub import hf_hub_download
# Hub repo that hosts all three model variants (PhoBERT / SVM / LSTM).
REPO_ID = "MiruYandex/phobert-absa-vietnamese"
# The 10 product aspects; one SVM classifier exists per aspect.
ASPECT_COLUMNS = ['SCREEN', 'CAMERA', 'FEATURES', 'BATTERY', 'PERFORMANCE', 'STORAGE', 'DESIGN', 'PRICE', 'GENERAL', 'SER&ACC']
# Polarity classes indexed by the SVM's integer prediction; 3 = not mentioned.
POLARITY_LABELS = ['negative', 'neutral', 'positive', 'not_mentioned']
# 1. Download model files
svm_path = hf_hub_download(REPO_ID, filename="svm/svm_models.pkl")
tfidf_path = hf_hub_download(REPO_ID, filename="svm/tfidf.pkl")
# 2. Load
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load these checkpoints from a repo you trust.
with open(svm_path, "rb") as f:
    svm_models = pickle.load(f)  # dict: {aspect_name: SVC model}
with open(tfidf_path, "rb") as f:
    tfidf = pickle.load(f)
# 3. Predict
text = "Điện thoại đẹp, camera chụp rõ nét, pin trâu lắm"
X = tfidf.transform([text])
for aspect, svm in svm_models.items():
    pred = svm.predict(X)[0]
    if pred != 3:  # skip not_mentioned
        print(f" {aspect}: {POLARITY_LABELS[pred]}")
3. LSTM (Bi-LSTM)
import torch
from torch import nn
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
# Hub repo that hosts all three model variants (PhoBERT / SVM / LSTM).
REPO_ID = "MiruYandex/phobert-absa-vietnamese"
# The 10 product aspects, in the order of axis 1 of the model's logits.
ASPECT_COLUMNS = ['SCREEN', 'CAMERA', 'FEATURES', 'BATTERY', 'PERFORMANCE', 'STORAGE', 'DESIGN', 'PRICE', 'GENERAL', 'SER&ACC']
# Polarity classes, in the order of the logits' last axis; 3 = not mentioned.
POLARITY_LABELS = ['negative', 'neutral', 'positive', 'not_mentioned']
# 1. Load
# The PhoBERT tokenizer supplies the sub-word vocabulary; its vocab_size is
# passed to the LSTM's embedding layer below.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model_path = hf_hub_download(REPO_ID, filename="lstm/lstm_model.pt")
# 2. Reconstruct model
class LSTMForABSA(nn.Module):
    """Bidirectional LSTM encoder with one linear polarity head per aspect.

    forward() returns {"logits": tensor of shape
    (batch, num_aspects, num_polarities)}.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_aspects=10, num_polarities=4, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)
        # One linear head per aspect; input is forward+backward final states.
        self.aspect_classifiers = nn.ModuleList(
            nn.Linear(hidden_dim * 2, num_polarities) for _ in range(num_aspects)
        )

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Encode token ids and score every aspect."""
        emb = self.dropout(self.embedding(input_ids))
        if attention_mask is None:
            _, (final_hidden, _) = self.lstm(emb)
        else:
            # Pack the batch so the LSTM skips padded positions entirely;
            # clamp avoids zero-length sequences, which packing rejects.
            seq_lens = attention_mask.sum(dim=1).cpu().clamp(min=1)
            packed = nn.utils.rnn.pack_padded_sequence(
                emb, seq_lens, batch_first=True, enforce_sorted=False
            )
            _, (final_hidden, _) = self.lstm(packed)
        # Concatenate the top layer's forward (-2) and backward (-1) states.
        features = self.dropout(torch.cat((final_hidden[-2], final_hidden[-1]), dim=1))
        per_aspect = [head(features) for head in self.aspect_classifiers]
        return {"logits": torch.stack(per_aspect, dim=1)}
model = LSTMForABSA(vocab_size=tokenizer.vocab_size)
# weights_only=True restricts unpickling to tensors/primitives — safer for a
# checkpoint downloaded from a remote hub (available since torch 1.13, the
# default from torch 2.6).
model.load_state_dict(torch.load(model_path, map_location="cpu", weights_only=True))
model.eval()
# 3. Predict
text = "Điện thoại đẹp, camera chụp rõ nét, pin trâu lắm"
inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=256)
with torch.no_grad():
    logits = model(**inputs)["logits"]  # (1, 10, 4)
preds = logits.argmax(dim=-1)[0]  # (10,)
# Report only the aspects the review actually mentions (class 3 = not_mentioned).
for aspect, pred in zip(ASPECT_COLUMNS, preds):
    if pred.item() != 3:
        print(f" {aspect}: {POLARITY_LABELS[pred.item()]}")
Output format
Mỗi model trả về prediction cho 10 aspects, mỗi aspect có 4 classes:
0 = negative, 1 = neutral, 2 = positive, 3 = not_mentioned (aspect không được đề cập)
Ví dụ output:
CAMERA: positive
BATTERY: positive
DESIGN: positive
- Downloads last month
- 112
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support