ABSA Models (Vietnamese - Aspect-Based Sentiment Analysis)

Phân tích cảm xúc theo khía cạnh sản phẩm trên reviews tiếng Việt.

Inference model mặc định: PhoBERT

So sánh hiệu suất

Model Type Accuracy Precision Recall F1 Time(s)
PhoBERT Transformer 0.8854 0.6767 0.7489 0.7021 7647.6
SVM ML Classic 0.8675 0.6507 0.5157 0.5407 17771.0
LSTM Deep Learning 0.8639 0.6277 0.6528 0.6377 622.8

Cấu hình

  • Aspects: SCREEN, CAMERA, FEATURES, BATTERY, PERFORMANCE, STORAGE, DESIGN, PRICE, GENERAL, SER&ACC
  • Polarities: negative, neutral, positive, not_mentioned
  • Max Length: 256
  • Training: 15 epochs, lr=2e-05, batch_size=16
  • Dataset: visolex/ViSFD

Hướng dẫn sử dụng

1. PhoBERT (Transformer)

import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import hf_hub_download
import py_vncorenlp

# Hugging Face repository hosting all three model variants (PhoBERT / SVM / LSTM).
REPO_ID = "MiruYandex/phobert-absa-vietnamese"
# The 10 product aspects, in the order of the model's output heads.
ASPECT_COLUMNS = ['SCREEN', 'CAMERA', 'FEATURES', 'BATTERY', 'PERFORMANCE', 'STORAGE', 'DESIGN', 'PRICE', 'GENERAL', 'SER&ACC']
# Polarity class indices 0..3; index 3 means the aspect is not mentioned in the review.
POLARITY_LABELS = ['negative', 'neutral', 'positive', 'not_mentioned']

# 1. Load tokenizer + model weights
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, subfolder="phobert")
model_path = hf_hub_download(REPO_ID, filename="phobert/model.pt")

# 2. Reconstruct model
class PhoBERTForABSA(nn.Module):
    """PhoBERT encoder with additive-attention pooling and one MLP head per aspect.

    ``forward`` returns ``{"logits": t}`` where ``t`` has shape
    ``(batch, num_aspects, num_polarities)``.
    """

    def __init__(self, model_name="vinai/phobert-base", num_aspects=10, num_polarities=4, dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        dim = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        # Additive (Bahdanau-style) attention parameters.
        self.attn_W = nn.Linear(dim, dim, bias=False)
        self.attn_v = nn.Linear(dim, 1, bias=False)
        self.layer_norm = nn.LayerNorm(dim)
        # One small classifier per aspect, each producing polarity logits.
        heads = []
        for _ in range(num_aspects):
            heads.append(
                nn.Sequential(
                    nn.Linear(dim, 256),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(256, num_polarities),
                )
            )
        self.aspect_classifiers = nn.ModuleList(heads)

    def forward(self, input_ids, attention_mask, **kwargs):
        """Encode the batch and return per-aspect polarity logits."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Score every token, mask out padding, and normalise into attention weights.
        scores = self.attn_v(torch.tanh(self.attn_W(encoded))).squeeze(-1)
        scores = scores.masked_fill(attention_mask == 0, float("-inf"))
        weights = torch.softmax(scores, dim=-1)
        # A fully-masked row softmaxes to NaN; the x != x test detects NaNs and zeroes them.
        weights = weights.masked_fill(weights != weights, 0.0)
        # Attention-weighted sum over the sequence -> one pooled vector per example.
        pooled = torch.bmm(weights.unsqueeze(1), encoded).squeeze(1)
        pooled = self.dropout(self.layer_norm(pooled))
        return {"logits": torch.stack([head(pooled) for head in self.aspect_classifiers], dim=1)}

model = PhoBERTForABSA()
# NOTE(review): torch.load unpickles arbitrary objects from the downloaded file;
# consider torch.load(..., weights_only=True) on recent torch — confirm compatibility.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()

# 3. Word segmentation (REQUIRED for PhoBERT)
# PhoBERT expects word-segmented Vietnamese input, so raw text is segmented
# with VnCoreNLP's "wseg" annotator before tokenization.
rdrsegmenter = py_vncorenlp.VnCoreNLP(save_dir="/tmp/vncorenlp", annotators=["wseg"])

# 4. Predict
text = "Điện thoại đẹp, camera chụp rõ nét, pin trâu lắm"
segmented = " ".join(rdrsegmenter.word_segment(text))
inputs = tokenizer(segmented, return_tensors="pt", padding="max_length", truncation=True, max_length=256)
with torch.no_grad():
    logits = model(**inputs)["logits"]  # (1, 10, 4)
    preds = logits.argmax(dim=-1)[0]    # (10,)

# Report only aspects the model considers mentioned (class 3 = not_mentioned).
for aspect, pred in zip(ASPECT_COLUMNS, preds):
    if pred.item() != 3:  # skip not_mentioned
        print(f"  {aspect}: {POLARITY_LABELS[pred.item()]}")

2. SVM (TF-IDF + RBF Kernel)

import pickle
from huggingface_hub import hf_hub_download

# Same repository and label conventions as the PhoBERT example.
REPO_ID = "MiruYandex/phobert-absa-vietnamese"
ASPECT_COLUMNS = ['SCREEN', 'CAMERA', 'FEATURES', 'BATTERY', 'PERFORMANCE', 'STORAGE', 'DESIGN', 'PRICE', 'GENERAL', 'SER&ACC']
POLARITY_LABELS = ['negative', 'neutral', 'positive', 'not_mentioned']

# 1. Download model files
svm_path = hf_hub_download(REPO_ID, filename="svm/svm_models.pkl")
tfidf_path = hf_hub_download(REPO_ID, filename="svm/tfidf.pkl")

# 2. Load
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load these artifacts if you trust the Hub repository they come from.
with open(svm_path, "rb") as f:
    svm_models = pickle.load(f)  # dict: {aspect_name: SVC model}
with open(tfidf_path, "rb") as f:
    tfidf = pickle.load(f)

# 3. Predict
text = "Điện thoại đẹp, camera chụp rõ nét, pin trâu lắm"
# The SVMs operate on the shared TF-IDF representation of the raw review text.
X = tfidf.transform([text])

# One independent SVM per aspect; class 3 = not_mentioned is skipped.
for aspect, svm in svm_models.items():
    pred = svm.predict(X)[0]
    if pred != 3:  # skip not_mentioned
        print(f"  {aspect}: {POLARITY_LABELS[pred]}")

3. LSTM (Bi-LSTM)

import torch
from torch import nn
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download

# Same repository and label conventions as the PhoBERT example.
REPO_ID = "MiruYandex/phobert-absa-vietnamese"
ASPECT_COLUMNS = ['SCREEN', 'CAMERA', 'FEATURES', 'BATTERY', 'PERFORMANCE', 'STORAGE', 'DESIGN', 'PRICE', 'GENERAL', 'SER&ACC']
POLARITY_LABELS = ['negative', 'neutral', 'positive', 'not_mentioned']

# 1. Load
# The LSTM reuses the PhoBERT tokenizer's vocabulary for its embedding table.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model_path = hf_hub_download(REPO_ID, filename="lstm/lstm_model.pt")

# 2. Reconstruct model
class LSTMForABSA(nn.Module):
    """Bi-LSTM encoder with one linear polarity head per aspect.

    ``forward`` returns ``{"logits": t}`` where ``t`` has shape
    ``(batch, num_aspects, num_polarities)``.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_aspects=10, num_polarities=4, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)
        # One head per aspect; input is the concatenated forward+backward final states.
        self.aspect_classifiers = nn.ModuleList(
            [nn.Linear(hidden_dim * 2, num_polarities) for _ in range(num_aspects)]
        )

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Encode a (possibly padded) batch and return per-aspect polarity logits."""
        emb = self.dropout(self.embedding(input_ids))
        if attention_mask is None:
            _, (final_h, _) = self.lstm(emb)
        else:
            # Pack the sequences so padded positions don't pollute the final states;
            # pack_padded_sequence requires lengths on CPU and at least length 1.
            seq_lens = attention_mask.sum(dim=1).cpu().clamp(min=1)
            packed = nn.utils.rnn.pack_padded_sequence(emb, seq_lens, batch_first=True, enforce_sorted=False)
            _, (final_h, _) = self.lstm(packed)
        # Last layer's forward (-2) and backward (-1) directions, concatenated.
        feats = self.dropout(torch.cat([final_h[-2], final_h[-1]], dim=1))
        logits = torch.stack([head(feats) for head in self.aspect_classifiers], dim=1)
        return {"logits": logits}

model = LSTMForABSA(vocab_size=tokenizer.vocab_size)
# NOTE(review): torch.load unpickles arbitrary objects from the downloaded file;
# consider torch.load(..., weights_only=True) on recent torch — confirm compatibility.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()

# 3. Predict
# NOTE(review): unlike the PhoBERT example, no VnCoreNLP word segmentation is
# applied here — presumably the LSTM was trained on unsegmented text; verify.
text = "Điện thoại đẹp, camera chụp rõ nét, pin trâu lắm"
inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=256)
with torch.no_grad():
    logits = model(**inputs)["logits"]
    preds = logits.argmax(dim=-1)[0]

# Report only aspects the model considers mentioned (class 3 = not_mentioned).
for aspect, pred in zip(ASPECT_COLUMNS, preds):
    if pred.item() != 3:
        print(f"  {aspect}: {POLARITY_LABELS[pred.item()]}")

Output format

Mỗi model trả về prediction cho 10 aspects, mỗi aspect có 4 classes:

  • 0 = negative
  • 1 = neutral
  • 2 = positive
  • 3 = not_mentioned (aspect không được đề cập)

Ví dụ output:

  CAMERA: positive
  BATTERY: positive
  DESIGN: positive
Downloads last month
112
Safetensors
Model size
0.1B params
Tensor type
F32
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Dataset used to train MiruYandex/phobert-absa-vietnamese