Axelottle committed on
Commit
c476330
·
1 Parent(s): 10cc91a

initial commit

Browse files
Files changed (3) hide show
  1. README.md +2 -8
  2. app.py +308 -0
  3. long_stopwords.txt +435 -0
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
  title: SnipSnap
3
- emoji: 🏃
4
- colorFrom: red
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.7.1
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: SnipSnap
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 3.39.0
6
  ---
 
 
app.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pip install gradio nltk youtube-transcript-api pytube gtts --quiet
2
+ from __future__ import division
3
+ import nltk
4
+ import string
5
+ import re
6
+ import io, os, time
7
+ import numpy as np
8
+ import gradio as gr
9
+ from tempfile import TemporaryFile
10
+ from gtts import gTTS
11
+ from pytube import YouTube
12
+ from youtube_transcript_api import YouTubeTranscriptApi
13
+ from nltk import word_tokenize
14
+ from nltk.stem import WordNetLemmatizer
15
+ from collections import defaultdict
16
+
17
# Fetch the NLTK models needed at runtime: the Punkt sentence/word tokenizer,
# the averaged-perceptron POS tagger, and WordNet (used by the lemmatizer).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

"""## Transcript Summary Module"""
22
+
23
def summarize_text(url, percent):
    """Produce an extractive summary of a YouTube video's transcript.

    Uses a RAKE-style pipeline: tokenize the transcript, derive stopwords
    from POS tags plus an external list, split into candidate phrases,
    score words by degree/frequency, score sentences by the keyword
    phrases they contain, and keep the top-scoring sentences.

    Args:
        url: YouTube video URL.
        percent: target summary length as a percentage of sentence count.

    Returns:
        (summary, video_html): the summary text and an HTML <iframe>
        snippet embedding the video.

    Raises:
        gr.Error: if the URL is invalid or no transcript can be fetched.
    """
    # Check if the URL is valid
    try:
        youtube = YouTube(url)
    except Exception:
        raise gr.Error("Invalid YouTube URL")

    # Get transcript using youtube-transcript-api
    try:
        transcript = YouTubeTranscriptApi.get_transcript(youtube.video_id)
        Text = ' '.join([entry['text'] for entry in transcript])
    except Exception:
        raise gr.Error("Could not retrieve the video's transcript. Please try another video")

    # Clean text: keep alphanumerics plus '.', '_' and '-' ('.' marks sentence ends).
    Cleaned_text = re.sub(r'[^a-zA-Z0-9\._-]', ' ', Text)
    text = word_tokenize(Cleaned_text)
    case_insensitive_text = word_tokenize(Cleaned_text.lower())

    # Sentence segmentation: split the token stream on '.' tokens.
    sentences = []
    tokenized_sentences = []
    sentence = " "
    for word in text:
        if word != '.':
            sentence += str(word) + " "
        else:
            sentences.append(sentence.strip())
            tokenized_sentences.append(word_tokenize(sentence.lower().strip()))
            sentence = " "

    def lemmatize(POS_tagged_text):
        # Lemmatize adjectives with pos="a"; everything else uses the default (noun).
        wordnet_lemmatizer = WordNetLemmatizer()
        adjective_tags = ['JJ', 'JJR', 'JJS']
        lemmatized_text = []
        for word in POS_tagged_text:
            if word[1] in adjective_tags:
                lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0], pos="a")))
            else:
                lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0])))  # default POS = noun
        return lemmatized_text

    # Pre-processing: tag, lemmatize, re-tag the lemmas.
    POS_tagged_text = nltk.pos_tag(case_insensitive_text)
    lemmatized_text = lemmatize(POS_tagged_text)
    Processed_text = nltk.pos_tag(lemmatized_text)

    def generate_stopwords(POS_tagged_text):
        # A token is a stopword unless its POS is noun/adjective/foreign word;
        # punctuation and an external list are added on top.
        stopwords = []
        wanted_POS = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'FW']  # may be add VBG too
        for word in POS_tagged_text:
            if word[1] not in wanted_POS:
                stopwords.append(word[0])

        punctuations = list(str(string.punctuation))
        stopwords = stopwords + punctuations

        # Source = https://www.ranks.nl/stopwords
        # Context manager guarantees the handle is closed (the original leaked it).
        with open("long_stopwords.txt", "r") as stopword_file:
            for line in stopword_file.readlines():
                stopwords.append(str(line.strip()))

        return set(stopwords)

    stopwords = generate_stopwords(Processed_text)

    def partition_phrases(text, delimeters):
        # Split the token stream into candidate phrases at stopword boundaries.
        phrases = []
        phrase = " "
        for word in text:
            if word in delimeters:
                if phrase != " ":
                    phrases.append(str(phrase).split())
                phrase = " "
            elif word not in delimeters:
                phrase += str(word)
                phrase += " "
        return phrases

    phrase_list = partition_phrases(lemmatized_text, stopwords)

    # Partition each sentence into phrases the same way, for sentence scoring.
    phrase_partitioned_sentences = []
    for sentence in tokenized_sentences:
        POS_tagged_sentence = nltk.pos_tag(sentence)
        lemmatized_sentence = lemmatize(POS_tagged_sentence)
        phrase_partitioned_sentence = partition_phrases(lemmatized_sentence, stopwords)
        phrase_partitioned_sentences.append(phrase_partitioned_sentence)

    # Keyword scoring (RAKE): score(word) = degree(word) / frequency(word).
    frequency = defaultdict(int)
    degree = defaultdict(int)
    word_score = defaultdict(float)

    vocabulary = []
    for phrase in phrase_list:
        for word in phrase:
            frequency[word] += 1
            degree[word] += len(phrase)
            if word not in vocabulary:
                vocabulary.append(word)

    for word in vocabulary:
        word_score[word] = degree[word] / frequency[word]

    # Score each unique phrase as the sum of its word scores.
    phrase_scores = []
    keywords = []
    phrase_vocabulary = []

    for phrase in phrase_list:
        if phrase not in phrase_vocabulary:
            phrase_score = 0
            for word in phrase:
                phrase_score += word_score[word]
            phrase_scores.append(phrase_score)
            phrase_vocabulary.append(phrase)

    # Rebuild the same unique-phrase order as joined keyword strings,
    # so keywords[i] corresponds to phrase_scores[i].
    phrase_vocabulary = []
    for phrase in phrase_list:
        if phrase not in phrase_vocabulary:
            keyword = ''
            for word in phrase:
                keyword += str(word) + " "
            phrase_vocabulary.append(phrase)
            keyword = keyword.strip()
            keywords.append(keyword)

    sorted_index = np.flip(np.argsort(phrase_scores), 0)

    tokenized_keywords = []
    sorted_keywords = []

    # Keep at most the top 50 keyword phrases.
    threshold = 50
    keywords_num = min(len(keywords), threshold)

    for i in range(0, keywords_num):
        sorted_keywords.append(keywords[sorted_index[i]])
        tokenized_keywords.append(sorted_keywords[i].split())

    # Score sentences by accumulating the scores of keyword phrases they contain.
    sentence_scores = np.zeros((len(sentences)), np.float32)
    i = 0
    for sentence in phrase_partitioned_sentences:
        for phrase in sentence:
            if phrase in tokenized_keywords:
                matched_tokenized_keyword_index = tokenized_keywords.index(phrase)
                corresponding_sorted_keyword = sorted_keywords[matched_tokenized_keyword_index]
                keyword_index_where_the_sorted_keyword_is_present = keywords.index(corresponding_sorted_keyword)
                sentence_scores[i] += phrase_scores[keyword_index_where_the_sorted_keyword_is_present]
        i += 1

    # Summary length as a fraction of the sentence count; never below one sentence.
    summary_size = int((percent / 100) * len(sentences))
    if summary_size == 0:
        summary_size = 1

    sorted_sentence_score_indices = np.flip(np.argsort(sentence_scores), 0)
    indices_for_summary_results = sorted_sentence_score_indices[0:summary_size]

    summary = ""
    current_size = 0

    # Lead with the opening sentence for context when it didn't make the cut.
    if 0 not in indices_for_summary_results and summary_size != 1:
        summary += sentences[0]
        summary += ".\n\n"
        current_size += 1

    # Emit selected sentences in original document order, capped at summary_size.
    for i in range(0, len(sentences)):
        if i in indices_for_summary_results:
            summary += sentences[i]
            summary += ".\n\n"
            current_size += 1
        if current_size == summary_size:
            break

    # Reuse the YouTube object built above instead of constructing a second one.
    video_html = f'<iframe width="560" height="315" src="{youtube.embed_url}" frameborder="0" allowfullscreen></iframe>'

    return summary, video_html
228
+
229
"""## Text-to-Speech Module"""

# Directory (relative to the working directory) where generated clips live.
AUDIO_DIR = 'audio_files'
MAX_FILE_AGE = 24 * 60 * 60  # maximum age of audio files in seconds (24 hours)

def delete_old_audio_files():
    """Delete files in AUDIO_DIR whose mtime is older than MAX_FILE_AGE seconds.

    No-op when AUDIO_DIR does not exist yet; the original raised
    FileNotFoundError from os.listdir in that case.
    """
    if not os.path.isdir(AUDIO_DIR):
        return
    # Delete audio files older than MAX_FILE_AGE.
    now = time.time()
    for file_name in os.listdir(AUDIO_DIR):
        file_path = os.path.join(AUDIO_DIR, file_name)
        if now - os.path.getmtime(file_path) > MAX_FILE_AGE:
            os.remove(file_path)
241
+
242
def text_to_speech(input_text):
    """Synthesize *input_text* with gTTS and return the saved audio file's path.

    The clip is written under AUDIO_DIR (created on demand); stale clips
    are purged afterwards via delete_old_audio_files().
    """
    # Create the text-to-speech audio in memory.
    tts = gTTS(input_text, lang='en', slow=False)
    fp = io.BytesIO()
    tts.write_to_fp(fp)
    fp.seek(0)

    # Create the audio directory if it does not exist.
    os.makedirs(AUDIO_DIR, exist_ok=True)

    # Generate a unique file name for the audio file.
    # gTTS produces MP3 data, so label the file '.mp3' (the original
    # saved MP3 bytes under a misleading '.wav' extension).
    file_name = str(time.time()) + '.mp3'
    file_path = os.path.join(AUDIO_DIR, file_name)

    # Save the audio stream to a file; the context manager closes it.
    with open(file_path, 'wb') as f:
        f.write(fp.read())

    # Delete old audio files so the directory does not grow without bound.
    delete_old_audio_files()

    # Return the file path for the gr.Audio component.
    return file_path
265
+
266
# Gradio theme: soft yellow primary hue over zinc neutrals, with a tinted
# label background in light mode.
theme = gr.themes.Soft(
    primary_hue="yellow",
    #secondary_hue=gr.themes.Color(secondary_100="#f8f8f8", secondary_200="#d9d9d9", secondary_300="#a5b4fc", secondary_400="#818cf8", secondary_50="#faf0e4", secondary_500="#6366f1", secondary_600="#4f46e5", secondary_700="#4338ca", secondary_800="#3730a3", secondary_900="#312e81", secondary_950="#2b2c5e"),
    neutral_hue="zinc",
).set(
    block_label_background_fill='*primary_50',
    block_label_background_fill_dark='*body_background_fill',
)

# UI layout: left column = URL input, length slider, action buttons, and the
# embedded video preview; right column = summary text plus text-to-speech.
with gr.Blocks(theme=theme) as demo:

    gr.Markdown(
        '''
        <h1 align="center">SnipSnap Summarizer</h1>

        Welcome to SnipSnap! This is an educational video transcript summarizer. Input a YouTube URL to get started.
        '''
    )

    with gr.Row():
        with gr.Column():
            fn = summarize_text
            url_input = gr.Textbox(label="URL", placeholder="Ex: https://youtu.be/JOiGEI9pQBs", info="Input YouTube URL")
            slider = gr.Slider(5, 100, value=20, step=5, label="Percent", info="Choose summary length")

            with gr.Row():
                summarize_btn = gr.Button(variant="primary", value="Summarize")
                clear_btn = gr.ClearButton()

            video_preview = gr.HTML(label="Video Preview")

        with gr.Column():
            summary_output = gr.Textbox(label="Summary", show_copy_button=True)
            tts_btn = gr.Button(variant="primary", value="Text-to-Speech")
            summary_tts = gr.Audio(label="Text-to-Speech", interactive=False)

    # Buttons
    # Summarize fills both the summary box and the video embed; TTS reads the
    # summary box; Clear resets every component (slider back to its default 20).
    summarize_btn.click(summarize_text, inputs=[url_input, slider], outputs=[summary_output, video_preview])
    tts_btn.click(text_to_speech, inputs=summary_output, outputs=summary_tts)
    clear_btn.click(lambda:[None, gr.Slider(value=20), None, None, None], outputs=[url_input, slider, summary_output, video_preview, summary_tts])

demo.queue()
demo.launch()
long_stopwords.txt ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # stopwords.txt
3
+ #
4
+ # Freely available stopword list, balancing coverage and size.
5
+ #
6
+ # From http://www.lextek.com/manuals/onix/stopwords1.html
7
+ a
8
+ about
9
+ above
10
+ across
11
+ after
12
+ again
13
+ against
14
+ all
15
+ almost
16
+ alone
17
+ along
18
+ already
19
+ also
20
+ although
21
+ always
22
+ among
23
+ an
24
+ and
25
+ another
26
+ any
27
+ anybody
28
+ anyone
29
+ anything
30
+ anywhere
31
+ are
32
+ area
33
+ areas
34
+ around
35
+ as
36
+ ask
37
+ asked
38
+ asking
39
+ asks
40
+ at
41
+ away
42
+ b
43
+ back
44
+ backed
45
+ backing
46
+ backs
47
+ be
48
+ became
49
+ because
50
+ become
51
+ becomes
52
+ been
53
+ before
54
+ began
55
+ behind
56
+ being
57
+ beings
58
+ best
59
+ better
60
+ between
61
+ big
62
+ both
63
+ but
64
+ by
65
+ c
66
+ came
67
+ can
68
+ cannot
69
+ case
70
+ cases
71
+ certain
72
+ certainly
73
+ clear
74
+ clearly
75
+ come
76
+ could
77
+ d
78
+ did
79
+ differ
80
+ different
81
+ differently
82
+ do
83
+ does
84
+ done
85
+ down
86
+ down
87
+ downed
88
+ downing
89
+ downs
90
+ during
91
+ e
92
+ each
93
+ early
94
+ either
95
+ end
96
+ ended
97
+ ending
98
+ ends
99
+ enough
100
+ even
101
+ evenly
102
+ ever
103
+ every
104
+ everybody
105
+ everyone
106
+ everything
107
+ everywhere
108
+ f
109
+ face
110
+ faces
111
+ fact
112
+ facts
113
+ far
114
+ felt
115
+ few
116
+ find
117
+ finds
118
+ first
119
+ for
120
+ four
121
+ from
122
+ full
123
+ fully
124
+ further
125
+ furthered
126
+ furthering
127
+ furthers
128
+ g
129
+ gave
130
+ general
131
+ generally
132
+ get
133
+ gets
134
+ give
135
+ given
136
+ gives
137
+ go
138
+ going
139
+ good
140
+ goods
141
+ got
142
+ great
143
+ greater
144
+ greatest
145
+ group
146
+ grouped
147
+ grouping
148
+ groups
149
+ h
150
+ had
151
+ has
152
+ have
153
+ having
154
+ he
155
+ her
156
+ here
157
+ herself
158
+ high
159
+ high
160
+ high
161
+ higher
162
+ highest
163
+ him
164
+ himself
165
+ his
166
+ how
167
+ however
168
+ i
169
+ if
170
+ important
171
+ in
172
+ interest
173
+ interested
174
+ interesting
175
+ interests
176
+ into
177
+ is
178
+ it
179
+ its
180
+ itself
181
+ j
182
+ just
183
+ k
184
+ keep
185
+ keeps
186
+ kind
187
+ knew
188
+ know
189
+ known
190
+ knows
191
+ l
192
+ large
193
+ largely
194
+ last
195
+ later
196
+ latest
197
+ least
198
+ less
199
+ let
200
+ lets
201
+ like
202
+ likely
203
+ long
204
+ longer
205
+ longest
206
+ m
207
+ made
208
+ make
209
+ making
210
+ man
211
+ many
212
+ may
213
+ me
214
+ member
215
+ members
216
+ men
217
+ might
218
+ more
219
+ most
220
+ mostly
221
+ mr
222
+ mrs
223
+ much
224
+ must
225
+ my
226
+ myself
227
+ n
228
+ necessary
229
+ need
230
+ needed
231
+ needing
232
+ needs
233
+ never
234
+ new
235
+ new
236
+ newer
237
+ newest
238
+ next
239
+ no
240
+ nobody
241
+ non
242
+ noone
243
+ not
244
+ nothing
245
+ now
246
+ nowhere
247
+ number
248
+ numbers
249
+ o
250
+ of
251
+ off
252
+ often
253
+ old
254
+ older
255
+ oldest
256
+ on
257
+ once
258
+ one
259
+ only
260
+ open
261
+ opened
262
+ opening
263
+ opens
264
+ or
265
+ order
266
+ ordered
267
+ ordering
268
+ orders
269
+ other
270
+ others
271
+ our
272
+ out
273
+ over
274
+ p
275
+ part
276
+ parted
277
+ parting
278
+ parts
279
+ per
280
+ perhaps
281
+ place
282
+ places
283
+ point
284
+ pointed
285
+ pointing
286
+ points
287
+ possible
288
+ present
289
+ presented
290
+ presenting
291
+ presents
292
+ problem
293
+ problems
294
+ put
295
+ puts
296
+ q
297
+ quite
298
+ r
299
+ rather
300
+ really
301
+ right
302
+ right
303
+ room
304
+ rooms
305
+ s
306
+ said
307
+ same
308
+ saw
309
+ say
310
+ says
311
+ second
312
+ seconds
313
+ see
314
+ seem
315
+ seemed
316
+ seeming
317
+ seems
318
+ sees
319
+ several
320
+ shall
321
+ she
322
+ should
323
+ show
324
+ showed
325
+ showing
326
+ shows
327
+ side
328
+ sides
329
+ since
330
+ small
331
+ smaller
332
+ smallest
333
+ so
334
+ some
335
+ somebody
336
+ someone
337
+ something
338
+ somewhere
339
+ state
340
+ states
341
+ still
342
+ still
343
+ such
344
+ sure
345
+ t
346
+ take
347
+ taken
348
+ than
349
+ that
350
+ the
351
+ their
352
+ them
353
+ then
354
+ there
355
+ therefore
356
+ these
357
+ they
358
+ thing
359
+ things
360
+ think
361
+ thinks
362
+ this
363
+ those
364
+ though
365
+ thought
366
+ thoughts
367
+ three
368
+ through
369
+ thus
370
+ to
371
+ today
372
+ together
373
+ too
374
+ took
375
+ toward
376
+ turn
377
+ turned
378
+ turning
379
+ turns
380
+ two
381
+ u
382
+ under
383
+ until
384
+ up
385
+ upon
386
+ us
387
+ use
388
+ used
389
+ uses
390
+ v
391
+ very
392
+ w
393
+ want
394
+ wanted
395
+ wanting
396
+ wants
397
+ was
398
+ way
399
+ ways
400
+ we
401
+ well
402
+ wells
403
+ went
404
+ were
405
+ what
406
+ when
407
+ where
408
+ whether
409
+ which
410
+ while
411
+ who
412
+ whole
413
+ whose
414
+ why
415
+ will
416
+ with
417
+ within
418
+ without
419
+ work
420
+ worked
421
+ working
422
+ works
423
+ would
424
+ x
425
+ y
426
+ year
427
+ years
428
+ yet
429
+ you
430
+ young
431
+ younger
432
+ youngest
433
+ your
434
+ yours
435
+ z