destroyer795 committed on
Commit
7c08782
·
1 Parent(s): 43f0440

Initial deploy of DistilBERT model with Docker

Browse files
.gitattributes CHANGED
@@ -1,2 +1,4 @@
1
  *.pth filter=lfs diff=lfs merge=lfs -text
2
  *.kv filter=lfs diff=lfs merge=lfs -text
 
 
 
1
  *.pth filter=lfs diff=lfs merge=lfs -text
2
  *.kv filter=lfs diff=lfs merge=lfs -text
3
+ SA_model/model.safetensors filter=lfs diff=lfs merge=lfs -text
4
+ *.bin filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,24 +1,23 @@
1
- # Use a standard Python 3.11 base image
2
  FROM python:3.11-slim
3
 
4
  # Set the working directory
5
  WORKDIR /app
6
 
7
- # Copy and install requirements as the root user.
8
- # This ensures gunicorn is installed in a globally accessible system path.
9
  COPY ./requirements.txt .
10
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
 
12
- # Download the NLTK data during the build to a shared directory.
13
- RUN python -m nltk.downloader -d /usr/share/nltk_data punkt wordnet stopwords
14
- # Tell the application where to find the data.
15
- ENV NLTK_DATA=/usr/share/nltk_data
16
 
17
  # Copy the rest of your application code and models
 
18
  COPY . .
19
 
20
  # Expose the correct port for Hugging Face Spaces
21
  EXPOSE 7860
22
 
23
- # Run the command directly. As root, gunicorn will be in the default system PATH.
 
24
  CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
 
1
+ # Using a standard Python 3.11 base image
2
  FROM python:3.11-slim
3
 
4
  # Set the working directory
5
  WORKDIR /app
6
 
7
+ # Copy and install requirements
8
+ # This ensures all dependencies (Flask, Transformers, Torch, Gunicorn) are installed.
9
  COPY ./requirements.txt .
10
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
 
12
+ # Transformers use subword tokenization, so punkt, wordnet, and stopwords are no longer needed.
 
 
 
13
 
14
  # Copy the rest of your application code and models
15
+ # This includes your 'app.py' and the 'SA_model' folder.
16
  COPY . .
17
 
18
  # Expose the correct port for Hugging Face Spaces
19
  EXPOSE 7860
20
 
21
+ # Run the application using Gunicorn
22
+ # This is the industry standard for serving Flask apps in production.
23
  CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
SA_model/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForSequenceClassification"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "dtype": "float32",
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "NEGATIVE",
13
+ "1": "POSITIVE"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "NEGATIVE": 0,
18
+ "POSITIVE": 1
19
+ },
20
+ "max_position_embeddings": 512,
21
+ "model_type": "distilbert",
22
+ "n_heads": 12,
23
+ "n_layers": 6,
24
+ "pad_token_id": 0,
25
+ "problem_type": "single_label_classification",
26
+ "qa_dropout": 0.1,
27
+ "seq_classif_dropout": 0.2,
28
+ "sinusoidal_pos_embds": false,
29
+ "tie_weights_": true,
30
+ "transformers_version": "4.57.3",
31
+ "vocab_size": 30522
32
+ }
sentiment_model.pth → SA_model/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49d76a5d941660981152277b3ffefdfcde4180d7e3847a4e69a30bfbd71394f8
3
- size 12256208
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6220a6c2266a1b6c7da3bf162edb758fe1e5ddbfd3bf324c2109ad1344257f11
3
+ size 267832560
SA_model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
SA_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
SA_model/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
word_vectors.kv → SA_model/training_args.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbb43596f632683b363d1d12ba1796afd2ad25d3d2d85e944e509a72cd51d738
3
- size 3231247
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2179dc78f0e4e7a1a23648e4b57ef62308d7f8eb9a6d6d6f697e66945c10574a
3
+ size 5777
SA_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,110 +1,25 @@
1
  from flask import Flask, request, jsonify
 
2
  import torch
3
- import torch.nn as nn
4
- from gensim.models import KeyedVectors
5
- import nltk
6
- from nltk.stem import WordNetLemmatizer
7
- from nltk.tokenize import word_tokenize
8
- import re
9
- from nltk.corpus import stopwords as nltk_stopwords
10
- from spellchecker import SpellChecker
11
-
12
- # --- 1. MODEL CLASS DEFINITION (Unchanged) ---
13
- class RNN(nn.Module):
14
- def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, embedding_weights):
15
- super().__init__()
16
- self.embedding = nn.Embedding.from_pretrained(embedding_weights, padding_idx=0)
17
- self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
18
- self.fc = nn.Linear(hidden_dim * 2, output_dim)
19
- self.dropout = nn.Dropout(dropout)
20
- def forward(self, text, text_lengths):
21
- embedded = self.embedding(text)
22
- packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
23
- packed_output, (hidden, cell) = self.rnn(packed_embedded)
24
- hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
25
- return self.fc(hidden)
26
-
27
- # --- 2. GLOBAL VARIABLES & PREPROCESSING SETUP ---
28
- # We keep negation words out of the stop words list
29
- stop_words = set(nltk_stopwords.words('english'))
30
- negation_words = {
31
- 'not', 'no', 'nor', 'never', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
32
- 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
33
- 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
34
- 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "can't"
35
- }
36
- stop_words = stop_words - negation_words
37
- lemmatizer = WordNetLemmatizer()
38
- spell = SpellChecker()
39
-
40
- def preprocess_text(text):
41
- # Expand contractions first
42
- text = re.sub(r"it's", "it is", text)
43
- text = re.sub(r"i'm", "i am", text)
44
- text = re.sub(r"he's", "he is", text)
45
- text = re.sub(r"she's", "she is", text)
46
- text = re.sub(r"we're", "we are", text)
47
- text = re.sub(r"they're", "they are", text)
48
- text = re.sub(r"you're", "you are", text)
49
- text = re.sub(r"that's", "that is", text)
50
- text = re.sub(r"what's", "what is", text)
51
- text = re.sub(r"where's", "where is", text)
52
- text = re.sub(r"\'ll", " will", text)
53
- text = re.sub(r"\'ve", " have", text)
54
- text = re.sub(r"\'re", " are", text)
55
- text = re.sub(r"\'d", " would", text)
56
- text = re.sub(r"won't", "will not", text)
57
- text = re.sub(r"can't", "cannot", text)
58
- text = re.sub(r"n't", " not", text)
59
-
60
- # Remove HTML tags and non-alphabetic characters
61
- text = re.sub(r'<.*?>', '', text)
62
- text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
63
-
64
- # Tokenize, lemmatize, and remove stopwords
65
- tokens = word_tokenize(text)
66
- lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
67
-
68
- return lemmatized_tokens
69
-
70
- # --- 3. MODEL LOADING ---
71
- print("Loading models...")
72
- device = torch.device('cpu')
73
- word_vectors = KeyedVectors.load('word_vectors.kv')
74
- embedding_weights = torch.FloatTensor(word_vectors.vectors)
75
- INPUT_DIM, EMBEDDING_DIM = embedding_weights.shape
76
- HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT = 256, 1, 2, 0.5
77
- BIDIRECTIONAL = True
78
- model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, embedding_weights)
79
- model.load_state_dict(torch.load('sentiment_model.pth', map_location=device))
80
- model.to(device)
81
- model.eval()
82
- print("Models loaded successfully!")
83
 
84
  app = Flask(__name__)
85
 
86
- # --- 4. PREDICTION FUNCTION ---
87
- def predict_sentiment(sentence):
88
- model.eval()
89
- preprocessed_tokens = preprocess_text(sentence)
90
- if preprocessed_tokens:
91
- misspelled = spell.unknown(preprocessed_tokens)
92
- final_tokens = [(spell.correction(word) or word) if word in misspelled else word for word in preprocessed_tokens]
93
- else:
94
- final_tokens = []
95
-
96
- if not final_tokens: return 0.5
97
- indexed = [word_vectors.key_to_index.get(t, -1) for t in final_tokens]
98
- indexed = [i for i in indexed if i != -1]
99
- if not indexed: return 0.5
100
-
101
- length = torch.LongTensor([len(indexed)])
102
- tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
103
- prediction = torch.sigmoid(model(tensor, length))
104
-
105
- return prediction.item()
106
-
107
- # --- 5. FLASK ENDPOINT WITH CORRECTIVE LOGIC ---
108
  @app.route('/predict', methods=['POST'])
109
  def predict_endpoint():
110
  data = request.get_json()
@@ -112,48 +27,21 @@ def predict_endpoint():
112
  return jsonify({'error': 'No text provided'}), 400
113
 
114
  sentence = data['text']
115
- # Get the model's raw, initial prediction
116
- score = predict_sentiment(sentence)
117
-
118
- # **THE FIX: CONTEXTUAL NEGATION FLIPPER**
119
- # This logic checks if a negation word exists in the original sentence.
120
- # If it does, and the model still predicted a positive score,
121
- # it intelligently flips the score to reflect the negative context.
122
- # We check the raw sentence before preprocessing to catch words like "can't".
123
- words_in_sentence = set(re.findall(r"[\w']+", sentence.lower()))
124
- if any(word in negation_words for word in words_in_sentence):
125
- # If the model's score is positive (> 0.5), we invert it.
126
- # e.g., a score of 0.8 (positive) becomes 1.0 - 0.8 = 0.2 (negative)
127
- if score > 0.5:
128
- score = 1.0 - score
129
- # **END FIX**
130
-
131
- sentiment = 'Positive' if score > 0.6 else 'Negative' if score < 0.4 else 'Neutral'
132
 
133
- return jsonify({'sentiment': sentiment, 'score': score})
134
-
135
- # --- 6. LOCAL TESTING BLOCK (To run this script directly for testing) ---
136
- if __name__ == '__main__':
137
- print("\n--- Running Local Tests ---")
138
 
139
- test_sentence_1 = "its not good"
140
- score_1 = predict_sentiment(test_sentence_1)
141
- if any(word in negation_words for word in set(re.findall(r"[\w']+", test_sentence_1.lower()))):
142
- if score_1 > 0.5: score_1 = 1.0 - score_1
143
- print(f"Sentence: '{test_sentence_1}' | Final Score: {score_1:.4f} | Sentiment: {'Positive' if score_1 > 0.6 else 'Negative' if score_1 < 0.4 else 'Neutral'}")
144
-
145
- test_sentence_2 = "This movie was absolutely amazing"
146
- score_2 = predict_sentiment(test_sentence_2)
147
- if any(word in negation_words for word in set(re.findall(r"[\w']+", test_sentence_2.lower()))):
148
- if score_2 > 0.5: score_2 = 1.0 - score_2
149
- print(f"Sentence: '{test_sentence_2}' | Final Score: {score_2:.4f} | Sentiment: {'Positive' if score_2 > 0.6 else 'Negative' if score_2 < 0.4 else 'Neutral'}")
150
-
151
- test_sentence_3 = "The service can't be described as fast"
152
- score_3 = predict_sentiment(test_sentence_3)
153
- if any(word in negation_words for word in set(re.findall(r"[\w']+", test_sentence_3.lower()))):
154
- if score_3 > 0.5: score_3 = 1.0 - score_3
155
- print(f"Sentence: '{test_sentence_3}' | Final Score: {score_3:.4f} | Sentiment: {'Positive' if score_3 > 0.6 else 'Negative' if score_3 < 0.4 else 'Neutral'}")
156
 
157
- print("\nTo start the web server, run 'flask run' in your terminal.")
158
- # To run the Flask server, uncomment the line below and run `python app.py`
159
- # app.run(debug=True)
 
 
 
 
 
 
 
1
  from flask import Flask, request, jsonify
2
+ from transformers import pipeline
3
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  app = Flask(__name__)
6
 
7
+ # MODEL LOADING
8
+ # Path to the fine-tuned DistilBERT model folder ('./SA_model' in this repo).
9
+ model_path = "./SA_model"
10
+
11
+ # We use the pipeline API which handles tokenization and inference automatically.
12
+ # device=0 for GPU, but use device=-1 for Hugging Face free CPU spaces.
13
+ print("Loading DistilBERT model...")
14
+ classifier = pipeline(
15
+ "sentiment-analysis",
16
+ model=model_path,
17
+ tokenizer=model_path,
18
+ device=-1
19
+ )
20
+ print("Model loaded successfully!")
21
+
22
+ # FLASK ENDPOINT
 
 
 
 
 
 
23
  @app.route('/predict', methods=['POST'])
24
  def predict_endpoint():
25
  data = request.get_json()
 
27
  return jsonify({'error': 'No text provided'}), 400
28
 
29
  sentence = data['text']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # Transformer models handle negation (like "not good") automatically
32
+ # No manual negation flipping is needed anymore.
33
+ result = classifier(sentence)[0]
 
 
34
 
35
+ # result is like: {'label': 'POSITIVE', 'score': 0.99}
36
+ label = result['label']
37
+ score = result['score']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ # For the extension UI, we can normalize the score if needed
40
+ # (Transformers already give high confidence for clear sentiment)
41
+ return jsonify({
42
+ 'sentiment': label.capitalize(),
43
+ 'score': score
44
+ })
45
+
46
+ if __name__ == '__main__':
47
+ app.run(host='0.0.0.0', port=7860) # Port 7860 is standard for HF Spaces
requirements.txt CHANGED
@@ -1,8 +1,5 @@
1
- Flask==3.0.3
2
- gunicorn==22.0.0
3
- torch==2.8.0
4
- gensim==4.3.2
5
- nltk==3.8.1
6
- pyspellchecker==0.8.1
7
- numpy==1.26.4
8
- scipy==1.12.0
 
1
+ flask
2
+ transformers
3
+ torch
4
+ safetensors
5
+ gunicorn