destroyer795 committed on
Commit
7c08782
·
1 Parent(s): 43f0440

Initial deploy of DistilBERT model with Docker

Browse files
.gitattributes CHANGED
@@ -1,2 +1,4 @@
1
  *.pth filter=lfs diff=lfs merge=lfs -text
2
  *.kv filter=lfs diff=lfs merge=lfs -text
 
 
 
1
  *.pth filter=lfs diff=lfs merge=lfs -text
2
  *.kv filter=lfs diff=lfs merge=lfs -text
3
+ SA_model/model.safetensors filter=lfs diff=lfs merge=lfs -text
4
+ *.bin filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,24 +1,23 @@
1
- # Use a standard Python 3.11 base image
2
  FROM python:3.11-slim
3
 
4
  # Set the working directory
5
  WORKDIR /app
6
 
7
- # Copy and install requirements as the root user.
8
- # This ensures gunicorn is installed in a globally accessible system path.
9
  COPY ./requirements.txt .
10
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
 
12
- # Download the NLTK data during the build to a shared directory.
13
- RUN python -m nltk.downloader -d /usr/share/nltk_data punkt wordnet stopwords
14
- # Tell the application where to find the data.
15
- ENV NLTK_DATA=/usr/share/nltk_data
16
 
17
  # Copy the rest of your application code and models
 
18
  COPY . .
19
 
20
  # Expose the correct port for Hugging Face Spaces
21
  EXPOSE 7860
22
 
23
- # Run the command directly. As root, gunicorn will be in the default system PATH.
 
24
  CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
 
1
+ # Using a standard Python 3.11 base image
2
  FROM python:3.11-slim
3
 
4
  # Set the working directory
5
  WORKDIR /app
6
 
7
+ # Copy and install requirements
8
+ # This ensures all dependencies (Flask, Transformers, Torch, Gunicorn) are installed.
9
  COPY ./requirements.txt .
10
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
 
12
+ # Transformers use subword tokenization, so punkt, wordnet, and stopwords are no longer needed.
 
 
 
13
 
14
  # Copy the rest of your application code and models
15
+ # This includes your 'app.py' and the 'SA_model' folder.
16
  COPY . .
17
 
18
  # Expose the correct port for Hugging Face Spaces
19
  EXPOSE 7860
20
 
21
+ # Run the application using Gunicorn
22
+ # This is the industry standard for serving Flask apps in production.
23
  CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]
SA_model/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForSequenceClassification"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "dtype": "float32",
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "NEGATIVE",
13
+ "1": "POSITIVE"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "NEGATIVE": 0,
18
+ "POSITIVE": 1
19
+ },
20
+ "max_position_embeddings": 512,
21
+ "model_type": "distilbert",
22
+ "n_heads": 12,
23
+ "n_layers": 6,
24
+ "pad_token_id": 0,
25
+ "problem_type": "single_label_classification",
26
+ "qa_dropout": 0.1,
27
+ "seq_classif_dropout": 0.2,
28
+ "sinusoidal_pos_embds": false,
29
+ "tie_weights_": true,
30
+ "transformers_version": "4.57.3",
31
+ "vocab_size": 30522
32
+ }
sentiment_model.pth → SA_model/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49d76a5d941660981152277b3ffefdfcde4180d7e3847a4e69a30bfbd71394f8
3
- size 12256208
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6220a6c2266a1b6c7da3bf162edb758fe1e5ddbfd3bf324c2109ad1344257f11
3
+ size 267832560
SA_model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
SA_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
SA_model/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
word_vectors.kv → SA_model/training_args.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbb43596f632683b363d1d12ba1796afd2ad25d3d2d85e944e509a72cd51d738
3
- size 3231247
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2179dc78f0e4e7a1a23648e4b57ef62308d7f8eb9a6d6d6f697e66945c10574a
3
+ size 5777
SA_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,110 +1,25 @@
1
  from flask import Flask, request, jsonify
 
2
  import torch
3
- import torch.nn as nn
4
- from gensim.models import KeyedVectors
5
- import nltk
6
- from nltk.stem import WordNetLemmatizer
7
- from nltk.tokenize import word_tokenize
8
- import re
9
- from nltk.corpus import stopwords as nltk_stopwords
10
- from spellchecker import SpellChecker
11
-
12
- # --- 1. MODEL CLASS DEFINITION (Unchanged) ---
13
- class RNN(nn.Module):
14
- def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, embedding_weights):
15
- super().__init__()
16
- self.embedding = nn.Embedding.from_pretrained(embedding_weights, padding_idx=0)
17
- self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
18
- self.fc = nn.Linear(hidden_dim * 2, output_dim)
19
- self.dropout = nn.Dropout(dropout)
20
- def forward(self, text, text_lengths):
21
- embedded = self.embedding(text)
22
- packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
23
- packed_output, (hidden, cell) = self.rnn(packed_embedded)
24
- hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
25
- return self.fc(hidden)
26
-
27
- # --- 2. GLOBAL VARIABLES & PREPROCESSING SETUP ---
28
- # We keep negation words out of the stop words list
29
- stop_words = set(nltk_stopwords.words('english'))
30
- negation_words = {
31
- 'not', 'no', 'nor', 'never', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
32
- 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
33
- 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
34
- 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "can't"
35
- }
36
- stop_words = stop_words - negation_words
37
- lemmatizer = WordNetLemmatizer()
38
- spell = SpellChecker()
39
-
40
- def preprocess_text(text):
41
- # Expand contractions first
42
- text = re.sub(r"it's", "it is", text)
43
- text = re.sub(r"i'm", "i am", text)
44
- text = re.sub(r"he's", "he is", text)
45
- text = re.sub(r"she's", "she is", text)
46
- text = re.sub(r"we're", "we are", text)
47
- text = re.sub(r"they're", "they are", text)
48
- text = re.sub(r"you're", "you are", text)
49
- text = re.sub(r"that's", "that is", text)
50
- text = re.sub(r"what's", "what is", text)
51
- text = re.sub(r"where's", "where is", text)
52
- text = re.sub(r"\'ll", " will", text)
53
- text = re.sub(r"\'ve", " have", text)
54
- text = re.sub(r"\'re", " are", text)
55
- text = re.sub(r"\'d", " would", text)
56
- text = re.sub(r"won't", "will not", text)
57
- text = re.sub(r"can't", "cannot", text)
58
- text = re.sub(r"n't", " not", text)
59
-
60
- # Remove HTML tags and non-alphabetic characters
61
- text = re.sub(r'<.*?>', '', text)
62
- text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
63
-
64
- # Tokenize, lemmatize, and remove stopwords
65
- tokens = word_tokenize(text)
66
- lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
67
-
68
- return lemmatized_tokens
69
-
70
- # --- 3. MODEL LOADING ---
71
- print("Loading models...")
72
- device = torch.device('cpu')
73
- word_vectors = KeyedVectors.load('word_vectors.kv')
74
- embedding_weights = torch.FloatTensor(word_vectors.vectors)
75
- INPUT_DIM, EMBEDDING_DIM = embedding_weights.shape
76
- HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT = 256, 1, 2, 0.5
77
- BIDIRECTIONAL = True
78
- model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, embedding_weights)
79
- model.load_state_dict(torch.load('sentiment_model.pth', map_location=device))
80
- model.to(device)
81
- model.eval()
82
- print("Models loaded successfully!")
83
 
84
  app = Flask(__name__)
85
 
86
- # --- 4. PREDICTION FUNCTION ---
87
- def predict_sentiment(sentence):
88
- model.eval()
89
- preprocessed_tokens = preprocess_text(sentence)
90
- if preprocessed_tokens:
91
- misspelled = spell.unknown(preprocessed_tokens)
92
- final_tokens = [(spell.correction(word) or word) if word in misspelled else word for word in preprocessed_tokens]
93
- else:
94
- final_tokens = []
95
-
96
- if not final_tokens: return 0.5
97
- indexed = [word_vectors.key_to_index.get(t, -1) for t in final_tokens]
98
- indexed = [i for i in indexed if i != -1]
99
- if not indexed: return 0.5
100
-
101
- length = torch.LongTensor([len(indexed)])
102
- tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
103
- prediction = torch.sigmoid(model(tensor, length))
104
-
105
- return prediction.item()
106
-
107
- # --- 5. FLASK ENDPOINT WITH CORRECTIVE LOGIC ---
108
  @app.route('/predict', methods=['POST'])
109
  def predict_endpoint():
110
  data = request.get_json()
@@ -112,48 +27,21 @@ def predict_endpoint():
112
  return jsonify({'error': 'No text provided'}), 400
113
 
114
  sentence = data['text']
115
- # Get the model's raw, initial prediction
116
- score = predict_sentiment(sentence)
117
-
118
- # **THE FIX: CONTEXTUAL NEGATION FLIPPER**
119
- # This logic checks if a negation word exists in the original sentence.
120
- # If it does, and the model still predicted a positive score,
121
- # it intelligently flips the score to reflect the negative context.
122
- # We check the raw sentence before preprocessing to catch words like "can't".
123
- words_in_sentence = set(re.findall(r"[\w']+", sentence.lower()))
124
- if any(word in negation_words for word in words_in_sentence):
125
- # If the model's score is positive (> 0.5), we invert it.
126
- # e.g., a score of 0.8 (positive) becomes 1.0 - 0.8 = 0.2 (negative)
127
- if score > 0.5:
128
- score = 1.0 - score
129
- # **END FIX**
130
-
131
- sentiment = 'Positive' if score > 0.6 else 'Negative' if score < 0.4 else 'Neutral'
132
 
133
- return jsonify({'sentiment': sentiment, 'score': score})
134
-
135
- # --- 6. LOCAL TESTING BLOCK (To run this script directly for testing) ---
136
- if __name__ == '__main__':
137
- print("\n--- Running Local Tests ---")
138
 
139
- test_sentence_1 = "its not good"
140
- score_1 = predict_sentiment(test_sentence_1)
141
- if any(word in negation_words for word in set(re.findall(r"[\w']+", test_sentence_1.lower()))):
142
- if score_1 > 0.5: score_1 = 1.0 - score_1
143
- print(f"Sentence: '{test_sentence_1}' | Final Score: {score_1:.4f} | Sentiment: {'Positive' if score_1 > 0.6 else 'Negative' if score_1 < 0.4 else 'Neutral'}")
144
-
145
- test_sentence_2 = "This movie was absolutely amazing"
146
- score_2 = predict_sentiment(test_sentence_2)
147
- if any(word in negation_words for word in set(re.findall(r"[\w']+", test_sentence_2.lower()))):
148
- if score_2 > 0.5: score_2 = 1.0 - score_2
149
- print(f"Sentence: '{test_sentence_2}' | Final Score: {score_2:.4f} | Sentiment: {'Positive' if score_2 > 0.6 else 'Negative' if score_2 < 0.4 else 'Neutral'}")
150
-
151
- test_sentence_3 = "The service can't be described as fast"
152
- score_3 = predict_sentiment(test_sentence_3)
153
- if any(word in negation_words for word in set(re.findall(r"[\w']+", test_sentence_3.lower()))):
154
- if score_3 > 0.5: score_3 = 1.0 - score_3
155
- print(f"Sentence: '{test_sentence_3}' | Final Score: {score_3:.4f} | Sentiment: {'Positive' if score_3 > 0.6 else 'Negative' if score_3 < 0.4 else 'Neutral'}")
156
 
157
- print("\nTo start the web server, run 'flask run' in your terminal.")
158
- # To run the Flask server, uncomment the line below and run `python app.py`
159
- # app.run(debug=True)
 
 
 
 
 
 
 
1
  from flask import Flask, request, jsonify
2
+ from transformers import pipeline
3
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  app = Flask(__name__)
6
 
7
+ # MODEL LOADING
8
+ # Path to the fine-tuned DistilBERT model folder ('./SA_model' in this repo).
9
+ model_path = "./SA_model"
10
+
11
+ # We use the pipeline API which handles tokenization and inference automatically.
12
+ # device=0 for GPU, but use device=-1 for Hugging Face free CPU spaces.
13
+ print("Loading DistilBERT model...")
14
+ classifier = pipeline(
15
+ "sentiment-analysis",
16
+ model=model_path,
17
+ tokenizer=model_path,
18
+ device=-1
19
+ )
20
+ print("Model loaded successfully!")
21
+
22
+ # FLASK ENDPOINT
 
 
 
 
 
 
23
  @app.route('/predict', methods=['POST'])
24
  def predict_endpoint():
25
  data = request.get_json()
 
27
  return jsonify({'error': 'No text provided'}), 400
28
 
29
  sentence = data['text']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ # Transformer models handle negation (like "not good") automatically
32
+ # No manual negation flipping is needed anymore.
33
+ result = classifier(sentence)[0]
 
 
34
 
35
+ # result is like: {'label': 'POSITIVE', 'score': 0.99}
36
+ label = result['label']
37
+ score = result['score']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ # For the extension UI, we can normalize the score if needed
40
+ # (Transformers already give high confidence for clear sentiment)
41
+ return jsonify({
42
+ 'sentiment': label.capitalize(),
43
+ 'score': score
44
+ })
45
+
46
+ if __name__ == '__main__':
47
+ app.run(host='0.0.0.0', port=7860) # Port 7860 is standard for HF Spaces
requirements.txt CHANGED
@@ -1,8 +1,5 @@
1
- Flask==3.0.3
2
- gunicorn==22.0.0
3
- torch==2.8.0
4
- gensim==4.3.2
5
- nltk==3.8.1
6
- pyspellchecker==0.8.1
7
- numpy==1.26.4
8
- scipy==1.12.0
 
1
+ flask
2
+ transformers
3
+ torch
4
+ safetensors
5
+ gunicorn