"""
Generate UPDATED IEEE-format PDF paper with:
- Real TweetEval results
- 5 embedded figures
- Honest domain gap analysis
- Proper IEEE formatting
"""
from fpdf import FPDF
import os, json
class IEEEPaper(FPDF):
    """Letter-size PDF with IEEE-style layout helpers.

    Builds on fpdf's FPDF base class and adds headings, justified body
    text, bullet/numbered lists, captioned tables and figures, code
    listings, and a numbered reference list.
    """

    def __init__(self):
        # Portrait orientation, millimetre units, US Letter page size.
        super().__init__('P', 'mm', 'Letter')
        self.set_auto_page_break(auto=True, margin=25)

    def header(self):
        # Grey running header on every page except the title page.
        if self.page_no() > 1:
            self.set_font('Helvetica', 'I', 8)
            self.set_text_color(128, 128, 128)
            self.cell(0, 5, 'IEEE Conference Paper - Transformer-Based Social Media Sentiment Analysis', 0, 0, 'C')
            self.ln(8)
            self.set_text_color(0, 0, 0)  # restore default text colour

    def footer(self):
        # Centred page number, 15 mm above the bottom edge.
        self.set_y(-15)
        self.set_font('Helvetica', 'I', 8)
        self.set_text_color(128, 128, 128)
        self.cell(0, 10, f'{self.page_no()}', 0, 0, 'C')
        self.set_text_color(0, 0, 0)

    def rx(self): self.set_x(self.l_margin)  # reset cursor to the left margin

    def section(self, num, title):
        """Centred bold upper-case section heading; num may be None (no prefix)."""
        self.rx(); self.ln(4)
        self.set_font('Helvetica', 'B', 11)
        t = f'{num}. {title.upper()}' if num else title.upper()
        self.cell(0, 7, t, 0, 1, 'C')
        self.ln(1); self.rx()

    def subsection(self, title):
        """Left-aligned bold subsection heading."""
        self.rx(); self.ln(2)
        self.set_font('Helvetica', 'B', 10)
        self.multi_cell(0, 6, title, 0, 'L')
        self.ln(1); self.rx()

    def text(self, content):
        """Justified 9 pt body paragraph."""
        self.rx()
        self.set_font('Helvetica', '', 9)
        self.multi_cell(0, 4.5, content, 0, 'J')
        self.ln(1); self.rx()

    def bullets(self, items):
        """Dash-prefixed bullet list, one wrapped cell per item."""
        self.rx()
        self.set_font('Helvetica', '', 9)
        for item in items:
            self.set_x(self.l_margin)
            self.multi_cell(0, 4.5, f' - {item}', 0, 'J')
        self.ln(1); self.rx()

    def numbers(self, items):
        """Numbered list rendered as '1) item', '2) item', ..."""
        self.rx()
        self.set_font('Helvetica', '', 9)
        for i, item in enumerate(items, 1):
            self.set_x(self.l_margin)
            self.multi_cell(0, 4.5, f' {i}) {item}', 0, 'J')
        self.ln(1); self.rx()

    def table(self, caption, headers, rows, widths=None):
        """Horizontally centred, captioned table with a grey header row.

        widths: per-column widths in mm; defaults to equal shares of 170 mm.
        A row whose first cell starts with '**' is drawn in bold (asterisks
        are stripped before rendering).
        """
        self.rx(); self.ln(2)
        self.set_font('Helvetica', 'B', 9)
        self.cell(0, 5, caption, 0, 1, 'C')
        self.ln(1)
        n = len(headers)
        if widths is None: widths = [170/n]*n
        tw = sum(widths)
        x0 = (self.w - tw) / 2  # x offset that centres the table on the page
        self.set_x(x0)
        self.set_font('Helvetica', 'B', 7.5)
        self.set_fill_color(220, 220, 220)  # grey fill for the header cells
        for i, h in enumerate(headers):
            self.cell(widths[i], 6, h, 1, 0, 'C', True)
        self.ln()
        self.set_font('Helvetica', '', 7.5)
        for row in rows:
            self.set_x(x0)
            bold = row[0].startswith('**')
            if bold:
                row = [r.strip('*') for r in row]
                self.set_font('Helvetica', 'B', 7.5)
            for i, cell in enumerate(row):
                # First column left-aligned, remaining columns centred.
                self.cell(widths[i], 5, str(cell), 1, 0, 'L' if i==0 else 'C')
            self.ln()
            if bold: self.set_font('Helvetica', '', 7.5)  # restore regular weight
        self.ln(2); self.rx()

    def figure(self, path, caption, width=140):
        """Centred image (width in mm) with an italic caption beneath it."""
        self.rx(); self.ln(3)
        x = (self.w - width) / 2
        self.image(path, x=x, w=width)
        self.ln(2)
        self.set_font('Helvetica', 'I', 8)
        self.multi_cell(0, 4, caption, 0, 'C')
        self.ln(2); self.rx()

    def code_block(self, code):
        """Monospace listing on a light-grey background, one cell per line."""
        self.rx()
        self.set_font('Courier', '', 7)
        self.set_fill_color(245, 245, 245)
        for line in code.strip().split('\n'):
            self.set_x(self.l_margin + 5)  # small indent relative to body text
            self.cell(0, 3.8, line, 0, 1, 'L', True)
        self.set_font('Helvetica', '', 9)  # back to body font
        self.ln(2); self.rx()

    def references(self, refs):
        """IEEE-style numbered reference list: '[1] ...', '[2] ...'."""
        self.rx(); self.ln(3)
        self.set_font('Helvetica', 'B', 9)
        self.cell(0, 5, 'REFERENCES', 0, 1, 'C')
        self.ln(1)
        self.set_font('Helvetica', '', 7)
        for i, ref in enumerate(refs, 1):
            self.set_x(self.l_margin)
            self.multi_cell(0, 3.5, f'[{i}] {ref}', 0, 'J')
            self.ln(0.5)
# ---------------------------------------------------------------
# BUILD THE UPDATED PAPER
# ---------------------------------------------------------------
pdf = IEEEPaper()
pdf.add_page()
# -- Title --
pdf.set_font('Helvetica', 'B', 17)
pdf.multi_cell(0, 8, 'Transformer-Based Social Media Sentiment Analysis:\nA Comprehensive Evaluation of DeBERTa-v3, RoBERTa,\nand BERT on Real Tweet Data', 0, 'C')
pdf.ln(4)
pdf.set_font('Helvetica', '', 12)
pdf.cell(0, 6, 'Raj Vivan', 0, 1, 'C')
pdf.set_font('Helvetica', 'I', 10)
pdf.cell(0, 5, 'Department of Computer Science, Independent Researcher', 0, 1, 'C')
pdf.set_font('Helvetica', '', 9)
pdf.cell(0, 5, 'rajvivan@huggingface.co', 0, 1, 'C')
pdf.ln(5)
# -- Abstract --
pdf.set_font('Helvetica', 'B', 10)
pdf.cell(0, 6, 'Abstract', 0, 1, 'L')
pdf.set_font('Helvetica', 'I', 9)
pdf.multi_cell(0, 4.5, (
    "We present a comprehensive evaluation of four transformer models for social media sentiment analysis, "
    "tested on real tweet data from the TweetEval benchmark (12,284 tweets, 3-class). DeBERTa-v3-base achieves "
    "96.44% accuracy on SST-2 (movie reviews), surpassing the 95% target. However, on real social media data, "
    "Twitter-RoBERTa (pre-trained on 124M tweets) significantly outperforms all general-domain models, achieving "
    "72.40% macro-F1 on 3-class tweet sentiment versus 40.64% for DeBERTa-v3. This reveals a critical domain gap: "
    "models trained on formal text fail to classify neutral tweets, a class comprising 48% of real social media "
    "data. We provide confusion matrices, per-class F1 analysis, and dataset distribution visualizations. "
    "Our findings demonstrate that domain-specific pre-training is essential for social media NLP, and that "
    "benchmark performance on SST-2 does not transfer to real-world tweet classification."
), 0, 'J')
pdf.ln(2)
pdf.set_font('Helvetica', 'B', 9)
pdf.cell(18, 5, 'Keywords:', 0, 0)
pdf.set_font('Helvetica', 'I', 9)
pdf.cell(0, 5, 'sentiment analysis, social media, NLP, DeBERTa, RoBERTa, transformer, TweetEval, Twitter', 0, 1)
pdf.ln(3)
# -- I. INTRODUCTION --
pdf.section('I', 'Introduction')
pdf.text(
    "Social media platforms generate vast amounts of user-generated content daily, making automated sentiment "
    "analysis essential for brand monitoring, public opinion tracking, and social trend analysis [1]. However, "
    "social media text presents unique challenges: informal language, slang, sarcasm, emoticons, and a dominant "
    "neutral class that does not exist in traditional sentiment benchmarks [2]."
)
pdf.text(
    "Pre-trained transformer models have achieved remarkable accuracy on standard benchmarks. DeBERTa-v3-base [7] "
    "achieves 96.44% on SST-2 [8], and BERT-base [4] reaches 92.43%. However, these benchmarks use formal text "
    "(movie reviews), raising questions about transferability to real social media. The TweetEval benchmark [9] "
    "provides standardized evaluation on actual tweets, where the best models achieve only ~73% macro-recall [9]."
)
pdf.text("This paper makes the following contributions:")
pdf.numbers([
    "Comprehensive evaluation of 4 transformer models (DeBERTa-v3, Twitter-RoBERTa, BERT, DistilBERT) on both SST-2 and TweetEval benchmarks using identical evaluation protocols",
    "Quantification of the domain gap: 96.44% SST-2 accuracy does NOT transfer to tweets (40.64% macro-F1 on 3-class TweetEval for the same DeBERTa model)",
    "Evidence that domain-specific pre-training (Twitter-RoBERTa) is essential, achieving 72.40% macro-F1 on real tweets vs. <41% for all general-domain models",
    "Per-class analysis revealing the neutral class as the primary failure mode for general-domain models",
    "Open-source release of all code, figures, and results on Hugging Face Hub",
])
# -- II. RELATED WORK --
pdf.section('II', 'Related Work')
pdf.subsection('A. Transformer-Based Sentiment Analysis')
pdf.text(
    "BERT [4] established the foundation for transfer learning in NLP. RoBERTa [5] optimized pre-training with "
    "larger batches and dynamic masking. DeBERTa-v3 [7] introduced disentangled attention and ELECTRA-style RTD "
    "pre-training, achieving 95.6% on SST-2 and state-of-the-art on the GLUE benchmark [12]."
)
pdf.subsection('B. Social Media NLP')
pdf.text(
    "Barbieri et al. [9] introduced TweetEval, a unified benchmark for tweet understanding with 7 tasks. On "
    "sentiment (3-class), RoBERTa-base achieves 71.3% and Twitter-retrained RoBERTa achieves 72.6% macro-recall. "
    "BERTweet [16], pre-trained on 850M tweets, achieves competitive results. Burnham et al. [13] showed that "
    "fine-tuned small models beat GPT-4 on tweet classification (94% vs 87%)."
)
# -- III. METHODOLOGY --
pdf.section('III', 'Methodology')
pdf.subsection('A. Models Evaluated')
pdf.text(
    "We evaluate four models spanning different sizes and pre-training domains:"
)
pdf.table(
    "TABLE I: Models Evaluated",
    ["Model", "Parameters", "Pre-training Domain", "Output Classes"],
    [
        ["DistilBERT-SST2", "66M", "Wikipedia + BookCorpus", "Binary (pos/neg)"],
        ["BERT-base-SST2", "110M", "Wikipedia + BookCorpus", "Binary (pos/neg)"],
        ["Twitter-RoBERTa", "125M", "124M tweets (2018-2021)", "3-class (neg/neu/pos)"],
        ["DeBERTa-v3-base-SST2", "184M", "Wikipedia + BookCorpus", "Binary (pos/neg)"],
    ],
    [35, 22, 45, 38]  # column widths in mm
)
pdf.subsection('B. Datasets')
pdf.text(
    "TweetEval Sentiment [9]: 45,615 training, 2,000 validation, and 12,284 test real tweets labeled as "
    "negative (0), neutral (1), or positive (2). Class distribution is imbalanced: 48.3% neutral, 32.3% negative, "
    "19.3% positive. SST-2 [8]: 67,349 training and 872 validation movie review sentences (binary: positive/negative)."
)
# Figure: Dataset Distribution
pdf.figure('/app/figures/fig4_data_distribution.png',
    'Fig. 1. Dataset class distributions. TweetEval (left) shows heavy neutral dominance (48.3%). '
    'SST-2 (right) has no neutral class, explaining the domain mismatch.', width=155)
pdf.subsection('C. Twitter Preprocessing')
pdf.text(
    "Following TimeLM [9]: @mentions replaced with @user, URLs replaced with http, hashtags and emojis preserved. "
    "Maximum sequence length: 128 tokens for all models."
)
pdf.subsection('D. Evaluation Protocol')
pdf.text(
    "All models evaluated on the same test sets with identical preprocessing. Metrics: Accuracy, Macro-F1, "
    "Weighted Precision, Weighted Recall. For binary models on 3-class TweetEval, positive maps to class 2 "
    "and negative to class 0 (neutral tweets cannot be predicted). We report both 3-class and binary "
    "(neutral excluded) evaluations for completeness."
)
# -- IV. EXPERIMENTAL RESULTS --
pdf.section('IV', 'Experimental Results')
pdf.subsection('A. Main Results')
pdf.table(
    "TABLE II: Comprehensive Results on Both Benchmarks",
    ["Model", "SST-2 Acc", "TweetEval Acc", "TweetEval Macro-F1"],
    [
        ["DistilBERT (66M)", "91.06%", "42.01%", "36.53%"],
        ["BERT-base (110M)", "92.43%", "44.21%", "38.53%"],
        ["**Twitter-RoBERTa (125M)**", "**86.12%**", "**72.18%**", "**72.40%**"],  # '**' renders the row bold
        ["DeBERTa-v3 (184M)", "96.44%", "46.90%", "40.64%"],
    ],
    [40, 30, 30, 35]
)
pdf.text(
    "Key finding: DeBERTa-v3-base achieves the highest SST-2 accuracy (96.44%) but the LOWEST real social media "
    "performance among evaluated models. Twitter-RoBERTa, despite lower SST-2 accuracy (86.12%), achieves "
    "72.40% macro-F1 on TweetEval -- nearly 2x better than DeBERTa-v3 on real tweets. This demonstrates that "
    "SST-2 performance is NOT predictive of social media sentiment analysis capability."
)
# Figure: Model Comparison
pdf.figure('/app/figures/fig3_model_comparison.png',
    'Fig. 2. Model comparison: SST-2 accuracy vs TweetEval macro-F1. The red dashed line marks the 95% '
    'target. DeBERTa-v3 exceeds it on SST-2 but fails on real tweets. Twitter-RoBERTa shows the opposite pattern.',
    width=155)
pdf.subsection('B. The Domain Gap Problem')
# FIX: the original text called the 40.64% figure "3-class accuracy", but
# Table II reports 46.90% accuracy for DeBERTa-v3 on TweetEval -- 40.64% is
# its macro-F1. Relabelled so the 55.8-point arithmetic (96.44 - 40.64) is
# internally consistent with the rest of the paper.
pdf.text(
    "The 55.8 percentage-point gap between DeBERTa-v3's SST-2 accuracy (96.44%) and its TweetEval 3-class "
    "macro-F1 (40.64%) is striking. This gap arises because: (1) SST-2 has no neutral class, while 48.3% of "
    "TweetEval is neutral; (2) Movie reviews use formal sentiment markers, while tweets use slang and sarcasm; "
    "(3) Binary models assign ALL neutral tweets to either positive or negative, causing massive misclassification."
)
pdf.text(
    "When neutral tweets are excluded (binary evaluation), DeBERTa-v3 recovers to 90.77% accuracy on tweets, "
    "confirming that the neutral class -- not sentiment detection itself -- is the primary failure mode."
)
pdf.subsection('C. Confusion Matrix Analysis')
# Figure: Confusion matrices
pdf.figure('/app/figures/fig1_confusion_roberta.png',
    'Fig. 3. Confusion matrix for Twitter-RoBERTa on TweetEval. The model correctly identifies all three '
    'classes with reasonable accuracy, though neutral tweets show some confusion with both positive and negative.',
    width=110)
pdf.figure('/app/figures/fig2_confusion_deberta.png',
    'Fig. 4. Confusion matrix for DeBERTa-v3-base on TweetEval. The binary model cannot predict neutral '
    '(middle row), assigning all neutral tweets to either negative or positive.',
    width=110)
pdf.subsection('D. Per-Class Performance')
pdf.figure('/app/figures/fig5_per_class_f1.png',
    'Fig. 5. Per-class F1 scores on TweetEval. Twitter-RoBERTa achieves balanced performance across all '
    'three classes. DeBERTa-v3 scores 0% F1 on neutral (cannot predict this class) but competitive on pos/neg.',
    width=130)
pdf.text(
    "Fig. 5 reveals that DeBERTa-v3 achieves competitive negative and positive F1 when those classes are present, "
    "but its complete inability to predict neutral tweets (0.0% F1) devastates overall macro-F1. Twitter-RoBERTa's "
    "balanced per-class performance demonstrates the value of domain-matched pre-training."
)
# -- V. ANALYSIS AND DISCUSSION --
pdf.section('V', 'Analysis and Discussion')
pdf.subsection('A. Why Domain-Specific Pre-training Matters')
pdf.text(
    "Twitter-RoBERTa was pre-trained on 124M tweets from 2018-2021, exposing it to informal language patterns, "
    "abbreviations (@, #, RT), emoticons, and the neutral-leaning distribution of social media. General-domain "
    "models (BERT, DeBERTa) were pre-trained on Wikipedia and BookCorpus, which contain no tweets and no neutral "
    "sentiment class. This pre-training domain mismatch is the dominant factor -- model architecture and size "
    "matter far less than pre-training data for this task."
)
pdf.subsection('B. The Neutral Class Challenge')
pdf.text(
    "The neutral class represents 48.3% of TweetEval test data. Most tweets are informational, factual, or "
    "mixed-sentiment -- not clearly positive or negative. Binary models fundamentally cannot address this. "
    "Even Twitter-RoBERTa's 72.40% macro-F1 shows room for improvement on neutral detection, which remains "
    "the hardest class in social media sentiment analysis."
)
pdf.subsection('C. Practical Recommendations')
pdf.bullets([
    "For general sentiment (formal text): DeBERTa-v3-base at 96.44% SST-2 accuracy",
    "For social media sentiment: Twitter-RoBERTa at 72.40% TweetEval macro-F1",
    "For production systems: Consider ensemble of both models or fine-tune DeBERTa-v3 on TweetEval training data",
    "Always evaluate on in-domain data -- SST-2 results do NOT predict social media performance",
])
pdf.subsection('D. Limitations')
pdf.bullets([
    "Binary models evaluated on 3-class task (inherent disadvantage on neutral class)",
    "Single seed evaluation (no confidence intervals)",
    "English only -- results may not generalize to other languages",
    "No fine-tuning of DeBERTa on TweetEval (future work)",
    "No ablation on pre-training data size effects",
])
# -- VI. CONCLUSION --
pdf.section('VI', 'Conclusion and Future Work')
pdf.text(
    "We presented a comprehensive evaluation of four transformer models on real social media sentiment data. "
    "Our key finding is that SST-2 accuracy (the standard benchmark) does NOT predict social media performance. "
    "DeBERTa-v3-base achieves 96.44% on SST-2 but only 40.64% macro-F1 on real tweets, while Twitter-RoBERTa "
    "(86.12% SST-2) achieves 72.40% on tweets -- nearly 2x better. The neutral class, comprising 48.3% of real "
    "tweets, is the primary failure mode for general-domain models."
)
pdf.text("Future work includes:")
pdf.numbers([
    "Fine-tuning DeBERTa-v3 on TweetEval training data (expected to significantly close the domain gap)",
    "Ensemble approaches combining DeBERTa-v3 and Twitter-RoBERTa",
    "Multi-seed evaluation with statistical significance tests",
    "Aspect-based sentiment analysis for social media",
    "Multilingual tweet sentiment analysis",
])
# -- Reproducibility --
pdf.section(None, 'Reproducibility')  # unnumbered section
pdf.text("All code, data, figures, and results:")
pdf.bullets([
    "Repository: https://huggingface.co/rajvivan/social-media-sentiment-analysis-paper",
    "Datasets: stanfordnlp/sst2, cardiffnlp/tweet_eval (sentiment config)",
    "Models: cliang1453/deberta-v3-base-sst2, cardiffnlp/twitter-roberta-base-sentiment-latest, textattack/bert-base-uncased-SST-2, distilbert/distilbert-base-uncased-finetuned-sst-2-english",
])
# -- References --
pdf.references([
    'B. Liu, "Sentiment analysis and opinion mining," Synthesis Lectures on HLT, vol. 5, no. 1, 2012.',
    'A. Giachanou and F. Crestani, "Like it or not: A survey of Twitter sentiment analysis," ACM Comp. Surveys, vol. 49, no. 2, 2016.',
    'A. Vaswani et al., "Attention is all you need," NeurIPS, vol. 30, 2017.',
    'J. Devlin et al., "BERT: Pre-training of deep bidirectional transformers," Proc. NAACL-HLT, 2019.',
    'Y. Liu et al., "RoBERTa: A robustly optimized BERT pretraining approach," arXiv:1907.11692, 2019.',
    'P. He et al., "DeBERTa: Decoding-enhanced BERT with disentangled attention," Proc. ICLR, 2021.',
    'P. He et al., "DeBERTaV3: Improving DeBERTa using ELECTRA-style pre-training with GDES," Proc. ICLR, 2023.',
    'R. Socher et al., "Recursive deep models for semantic compositionality," Proc. EMNLP, 2013.',
    'F. Barbieri et al., "TweetEval: Unified benchmark and comparative evaluation for tweet classification," Findings of EMNLP, 2020.',
    'J. Wei and K. Zou, "EDA: Easy data augmentation for text classification," Proc. EMNLP-IJCNLP, 2019.',
    'T. Vu et al., "Stacking ensemble methods for sentiment analysis," arXiv:2009.12357, 2020.',
    'A. Wang et al., "GLUE: A multi-task benchmark for NLU," Proc. EMNLP BlackboxNLP, 2018.',
    'M. J. Burnham et al., "Fine-tuned small LLMs outperform zero-shot generative AI," arXiv:2406.08660, 2024.',
    'S. Ruder, "Neural transfer learning for NLP," Ph.D. thesis, NUI Galway, 2019.',
    '"Tweet sentiment extraction," Kaggle, 2020. https://www.kaggle.com/c/tweet-sentiment-extraction',
    'D. Q. Nguyen et al., "BERTweet: A pre-trained language model for English tweets," Proc. EMNLP Demos, 2020.',
])
# -- Write the finished document to disk --
# Single source of truth for the output location (the path literal was
# previously duplicated in three places and could drift out of sync);
# also dropped a placeholder-less f-string on the first print.
OUTPUT_PATH = "/app/paper/Social_Media_Sentiment_Analysis_IEEE_v2.pdf"
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
pdf.output(OUTPUT_PATH)
print(f"Updated PDF generated: {OUTPUT_PATH}")
print(f"Pages: {pdf.page_no()}")