Spaces:

Axelottle
/

SnipSnap

Build error

App Files Files Community

Axelottle committed on Nov 28, 2023

Commit

c476330

1 Parent(s): 10cc91a

initial commit

Browse files

Files changed (3) hide show

README.md +2 -8
app.py +308 -0
long_stopwords.txt +435 -0

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
 title: SnipSnap
-emoji: 🏃
-colorFrom: red
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.7.1
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: SnipSnap
 app_file: app.py
+sdk: gradio
+sdk_version: 3.39.0
 ---

app.py ADDED Viewed

	@@ -0,0 +1,308 @@

+#pip install gradio nltk youtube-transcript-api pytube gtts --quiet
+from __future__ import division
+import nltk
+import string
+import re
+import io, os, time
+import numpy as np
+import gradio as gr
+from tempfile import TemporaryFile
+from gtts import gTTS
+from pytube import YouTube
+from youtube_transcript_api import YouTubeTranscriptApi
+from nltk import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from collections import defaultdict
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('wordnet')
+"""## Transcript Summary Module"""
+def summarize_text(url, percent):
+  # Check if the URL is valid
+  try:
+      youtube = YouTube(url)
+  except Exception as e:
+      raise gr.Error(f"Invalid YouTube URL")
+  # Get transcript using youtube-transcript-api
+  try:
+      transcript = YouTubeTranscriptApi.get_transcript(youtube.video_id)
+      Text = ' '.join([entry['text'] for entry in transcript])
+  except Exception as e:
+      raise gr.Error(f"Could not retrieve the video's transcript. Please try another video")
+  # Clean text
+  Cleaned_text = re.sub(r'[^a-zA-Z0-9\._-]', ' ', Text)
+  text = word_tokenize(Cleaned_text)
+  case_insensitive_text = word_tokenize(Cleaned_text.lower())
+  # Sentence Segmentation
+  sentences = []
+  tokenized_sentences = []
+  sentence = " "
+  for word in text:
+      if word != '.':
+          sentence+=str(word)+" "
+      else:
+          sentences.append(sentence.strip())
+          tokenized_sentences.append(word_tokenize(sentence.lower().strip()))
+          sentence = " "
+  def lemmatize(POS_tagged_text):
+      wordnet_lemmatizer = WordNetLemmatizer()
+      adjective_tags = ['JJ','JJR','JJS']
+      lemmatized_text = []
+      for word in POS_tagged_text:
+          if word[1] in adjective_tags:
+              lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a")))
+          else:
+              lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun
+      return lemmatized_text
+  #Pre_processing:
+  POS_tagged_text = nltk.pos_tag(case_insensitive_text)
+  lemmatized_text = lemmatize(POS_tagged_text)
+  Processed_text = nltk.pos_tag(lemmatized_text)
+  def generate_stopwords(POS_tagged_text):
+    stopwords = []
+    wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','FW'] #may be add VBG too
+    for word in POS_tagged_text:
+        if word[1] not in wanted_POS:
+            stopwords.append(word[0])
+    punctuations = list(str(string.punctuation))
+    stopwords = stopwords + punctuations
+    stopword_file = open("long_stopwords.txt", "r")
+    #Source = https://www.ranks.nl/stopwords
+    for line in stopword_file.readlines():
+        stopwords.append(str(line.strip()))
+    return set(stopwords)
+  stopwords = generate_stopwords(Processed_text)
+  def partition_phrases(text,delimeters):
+    phrases = []
+    phrase = " "
+    for word in text:
+        if word in delimeters:
+            if phrase!= " ":
+                phrases.append(str(phrase).split())
+            phrase = " "
+        elif word not in delimeters:
+            phrase+=str(word)
+            phrase+=" "
+    return phrases
+  phrase_list = partition_phrases(lemmatized_text,stopwords)
+  phrase_partitioned_sentences = []
+  for sentence in tokenized_sentences:
+      POS_tagged_sentence = nltk.pos_tag(sentence)
+      lemmatized_sentence = lemmatize(POS_tagged_sentence)
+      phrase_partitioned_sentence = partition_phrases(lemmatized_sentence,stopwords)
+      phrase_partitioned_sentences.append(phrase_partitioned_sentence)
+   # keyword scoring
+  frequency = defaultdict(int)
+  degree = defaultdict(int)
+  word_score = defaultdict(float)
+  vocabulary = []
+  for phrase in phrase_list:
+      for word in phrase:
+          frequency[word]+=1
+          degree[word]+=len(phrase)
+          if word not in vocabulary:
+              vocabulary.append(word)
+  for word in vocabulary:
+      word_score[word] = degree[word]/frequency[word]
+  phrase_scores = []
+  keywords = []
+  phrase_vocabulary = []
+  for phrase in phrase_list:
+      if phrase not in phrase_vocabulary:
+          phrase_score = 0
+          for word in phrase:
+              phrase_score += word_score[word]
+          phrase_scores.append(phrase_score)
+          phrase_vocabulary.append(phrase)
+  phrase_vocabulary = []
+  for phrase in phrase_list:
+      if phrase not in phrase_vocabulary:
+          keyword=''
+          for word in phrase:
+              keyword += str(word)+" "
+          phrase_vocabulary.append(phrase)
+          keyword = keyword.strip()
+          keywords.append(keyword)
+  sorted_index = np.flip(np.argsort(phrase_scores),0)
+  tokenized_keywords = []
+  sorted_keywords = []
+  keywords_num = 0
+  threshold = 50
+  if len(keywords)<threshold:
+      keywords_num = len(keywords)
+  else:
+      keywords_num = threshold
+  for i in range(0,keywords_num):
+      sorted_keywords.append(keywords[sorted_index[i]])
+      tokenized_keywords.append(sorted_keywords[i].split())
+  sentence_scores = np.zeros((len(sentences)),np.float32)
+  i=0
+  for sentence in phrase_partitioned_sentences:
+      for phrase in sentence:
+          if phrase in tokenized_keywords:
+              matched_tokenized_keyword_index = tokenized_keywords.index(phrase)
+              corresponding_sorted_keyword = sorted_keywords[matched_tokenized_keyword_index]
+              keyword_index_where_the_sorted_keyword_is_present = keywords.index(corresponding_sorted_keyword)
+              sentence_scores[i]+=phrase_scores[keyword_index_where_the_sorted_keyword_is_present]
+      i+=1
+  Reduce_to_percent = percent
+  summary_size = int(((Reduce_to_percent)/100)*len(sentences))
+  if summary_size == 0:
+      summary_size = 1
+  sorted_sentence_score_indices = np.flip(np.argsort(sentence_scores),0)
+  indices_for_summary_results = sorted_sentence_score_indices[0:summary_size]
+  summary = ""
+  current_size = 0
+  if 0 not in indices_for_summary_results and summary_size!=1:
+      summary+=sentences[0]
+      summary+=".\n\n"
+      current_size+=1
+  for i in range(0,len(sentences)):
+      if i in indices_for_summary_results:
+          summary+=sentences[i]
+          summary+=".\n\n"
+          current_size += 1
+      if current_size == summary_size:
+          break
+  yt = YouTube(url)
+  video_html = f'<iframe width="560" height="315" src="{yt.embed_url}" frameborder="0" allowfullscreen></iframe>'
+  return summary, video_html
+"""## Text-to-Speech Module"""
+AUDIO_DIR = 'audio_files'  # relative directory where text_to_speech() saves its audio files
+MAX_FILE_AGE = 24 * 60 * 60  # maximum age of audio files in seconds (24 hours); older files are removed by delete_old_audio_files()
+def delete_old_audio_files():
+    # delete audio files older than MAX_FILE_AGE
+    now = time.time()
+    for file_name in os.listdir(AUDIO_DIR):
+        file_path = os.path.join(AUDIO_DIR, file_name)
+        if now - os.path.getmtime(file_path) > MAX_FILE_AGE:
+            os.remove(file_path)
+def text_to_speech(input_text):
+    # create the text-to-speech audio
+    tts = gTTS(input_text, lang='en', slow=False)
+    fp = io.BytesIO()
+    tts.write_to_fp(fp)
+    fp.seek(0)
+    # create the audio directory if it does not exist
+    os.makedirs(AUDIO_DIR, exist_ok=True)
+    # generate a unique file name for the audio file
+    file_name = str(time.time()) + '.wav'
+    file_path = os.path.join(AUDIO_DIR, file_name)
+    # save the audio stream to a file
+    with open(file_path, 'wb') as f:
+        f.write(fp.read())
+    # delete old audio files
+    delete_old_audio_files()
+    # return the file path
+    return file_path
+theme = gr.themes.Soft(
+    primary_hue="yellow",
+    #secondary_hue=gr.themes.Color(secondary_100="#f8f8f8", secondary_200="#d9d9d9", secondary_300="#a5b4fc", secondary_400="#818cf8", secondary_50="#faf0e4", secondary_500="#6366f1", secondary_600="#4f46e5", secondary_700="#4338ca", secondary_800="#3730a3", secondary_900="#312e81", secondary_950="#2b2c5e"),
+    neutral_hue="zinc",
+).set(
+    block_label_background_fill='*primary_50',
+    block_label_background_fill_dark='*body_background_fill',
+)
+with gr.Blocks(theme=theme) as demo:
+  gr.Markdown(
+      '''
+      <h1 align="center">SnipSnap Summarizer</h1>
+      Welcome to SnipSnap! This is an educational video transcript summarizer. Input a YouTube URL to get started.
+      '''
+  )
+  with gr.Row():
+    with gr.Column():
+      fn = summarize_text
+      url_input = gr.Textbox(label="URL", placeholder="Ex: https://youtu.be/JOiGEI9pQBs", info="Input YouTube URL")
+      slider = gr.Slider(5, 100, value=20, step=5, label="Percent", info="Choose summary length")
+      with gr.Row():
+        summarize_btn = gr.Button(variant="primary", value="Summarize")
+        clear_btn = gr.ClearButton()
+      video_preview = gr.HTML(label="Video Preview")
+    with gr.Column():
+      summary_output = gr.Textbox(label="Summary", show_copy_button=True)
+      tts_btn = gr.Button(variant="primary", value="Text-to-Speech")
+      summary_tts = gr.Audio(label="Text-to-Speech", interactive=False)
+    # Buttons
+    summarize_btn.click(summarize_text, inputs=[url_input, slider], outputs=[summary_output, video_preview])
+    tts_btn.click(text_to_speech, inputs=summary_output, outputs=summary_tts)
+    clear_btn.click(lambda:[None, gr.Slider(value=20), None, None, None], outputs=[url_input, slider, summary_output, video_preview, summary_tts])
+demo.queue()
+demo.launch()

long_stopwords.txt ADDED Viewed

	@@ -0,0 +1,435 @@

+#
+#  stopwords.txt
+#
+#  Freely available stopword list, balancing coverage and size.
+#
+#  From http://www.lextek.com/manuals/onix/stopwords1.html
+a
+about
+above
+across
+after
+again
+against
+all
+almost
+alone
+along
+already
+also
+although
+always
+among
+an
+and
+another
+any
+anybody
+anyone
+anything
+anywhere
+are
+area
+areas
+around
+as
+ask
+asked
+asking
+asks
+at
+away
+b
+back
+backed
+backing
+backs
+be
+became
+because
+become
+becomes
+been
+before
+began
+behind
+being
+beings
+best
+better
+between
+big
+both
+but
+by
+c
+came
+can
+cannot
+case
+cases
+certain
+certainly
+clear
+clearly
+come
+could
+d
+did
+differ
+different
+differently
+do
+does
+done
+down
+down
+downed
+downing
+downs
+during
+e
+each
+early
+either
+end
+ended
+ending
+ends
+enough
+even
+evenly
+ever
+every
+everybody
+everyone
+everything
+everywhere
+f
+face
+faces
+fact
+facts
+far
+felt
+few
+find
+finds
+first
+for
+four
+from
+full
+fully
+further
+furthered
+furthering
+furthers
+g
+gave
+general
+generally
+get
+gets
+give
+given
+gives
+go
+going
+good
+goods
+got
+great
+greater
+greatest
+group
+grouped
+grouping
+groups
+h
+had
+has
+have
+having
+he
+her
+here
+herself
+high
+high
+high
+higher
+highest
+him
+himself
+his
+how
+however
+i
+if
+important
+in
+interest
+interested
+interesting
+interests
+into
+is
+it
+its
+itself
+j
+just
+k
+keep
+keeps
+kind
+knew
+know
+known
+knows
+l
+large
+largely
+last
+later
+latest
+least
+less
+let
+lets
+like
+likely
+long
+longer
+longest
+m
+made
+make
+making
+man
+many
+may
+me
+member
+members
+men
+might
+more
+most
+mostly
+mr
+mrs
+much
+must
+my
+myself
+n
+necessary
+need
+needed
+needing
+needs
+never
+new
+new
+newer
+newest
+next
+no
+nobody
+non
+noone
+not
+nothing
+now
+nowhere
+number
+numbers
+o
+of
+off
+often
+old
+older
+oldest
+on
+once
+one
+only
+open
+opened
+opening
+opens
+or
+order
+ordered
+ordering
+orders
+other
+others
+our
+out
+over
+p
+part
+parted
+parting
+parts
+per
+perhaps
+place
+places
+point
+pointed
+pointing
+points
+possible
+present
+presented
+presenting
+presents
+problem
+problems
+put
+puts
+q
+quite
+r
+rather
+really
+right
+right
+room
+rooms
+s
+said
+same
+saw
+say
+says
+second
+seconds
+see
+seem
+seemed
+seeming
+seems
+sees
+several
+shall
+she
+should
+show
+showed
+showing
+shows
+side
+sides
+since
+small
+smaller
+smallest
+so
+some
+somebody
+someone
+something
+somewhere
+state
+states
+still
+still
+such
+sure
+t
+take
+taken
+than
+that
+the
+their
+them
+then
+there
+therefore
+these
+they
+thing
+things
+think
+thinks
+this
+those
+though
+thought
+thoughts
+three
+through
+thus
+to
+today
+together
+too
+took
+toward
+turn
+turned
+turning
+turns
+two
+u
+under
+until
+up
+upon
+us
+use
+used
+uses
+v
+very
+w
+want
+wanted
+wanting
+wants
+was
+way
+ways
+we
+well
+wells
+went
+were
+what
+when
+where
+whether
+which
+while
+who
+whole
+whose
+why
+will
+with
+within
+without
+work
+worked
+working
+works
+would
+x
+y
+year
+years
+yet
+you
+young
+younger
+youngest
+your
+yours
+z