Commit 8228dae · Parent(s): d32a483
Upload functions.py

functions.py ADDED (+226 -0)
#*********************************************************************
#
# This file could be a potential first stone of the project. For now it
# contains only the functions used throughout the files, but in the
# future it could hold more complex structures.
#
#*********************************************************************
import pdfplumber
import docx2txt
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, models, util
import nltk
from nltk.tokenize import sent_tokenize, wordpunct_tokenize

nltk.download("punkt")


def reading_word(string):
    # extract the raw text from the .docx file at the given path
    text = docx2txt.process(string)
    return text


def reading_pdf(string):
    all_text = ""
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            # keep only the regular body text, dropping large characters
            # (size >= 10) such as titles and headings
            body = pdf_page.filter(lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10))
            single_page_text = body.extract_text(x_tolerance=2)
            # extract_text can return None on empty pages
            if single_page_text:
                # separate each page's text with a newline
                all_text = all_text + '\n' + single_page_text
    return all_text


def reading_file(string):
    """
    -----------------------------------------------------------------------------

    This function takes as argument the file that we want to analyze and,
    depending on the file type, dispatches to the matching reader.
    For the moment only PDF and Word (.docx) files are detected.

    Returns: long string with all the sentences in the document.

    -----------------------------------------------------------------------------

    Input:

    string: path of the file we want to analyze

    """

    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        text = reading_pdf(string)
    elif ext == ".docx":
        text = reading_word(string)
    else:
        # unknown extension: fail loudly rather than fall through with no text
        raise ValueError("Unknown file format: " + ext)
    return text
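
# A minimal usage sketch for the readers above ("report.pdf" is a
# hypothetical path, not a file shipped with this repo):
#
#     raw_text = reading_file("report.pdf")
#     print(raw_text[:500])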


def splitting(word: str, text):
    if word == "line":
        # split on newlines and drop the empty lines
        tok_text = list(filter(lambda a: a != '', text.split('\n')))
    elif word == "sentences":
        tok_text = sent_tokenize(text)
    elif word == "paragraphs":
        # keep only paragraphs of a reasonable length (at least 50 characters);
        # building a new list avoids removing items while iterating
        tok_text = [p for p in re.split(r'\n{2,}', text) if len(p) >= 50]
    elif word == "words":
        tok_text = wordpunct_tokenize(text)
    else:
        raise ValueError("Unknown split unit: " + word)
    return tok_text
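
# Example: splitting the extracted text into sentences (reusing the
# hypothetical `raw_text` from the sketch above):
#
#     sentences = splitting("sentences", raw_text)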


def filtering(text):
    """
    -----------------------------------------------------------------------------

    This function takes as argument the string obtained in the reading step
    and filters out undesired characters.

    Potential things to filter: table of contents, titles, formulas,
    references, tables (?)

    Returns: long string with all the sentences in the document.

    -----------------------------------------------------------------------------

    Input:

    text: string obtained in the previous reading step.

    """
    clean1 = re.sub(r"\d+\.\d+.+", "", text)  # section numbers from the table of contents
    clean1 = re.sub(r"\w+ \w+ \.{4,} \d{2,}\n|\w+ \.{4,} \d{2,}\n|\w+ \w+ \w+ \.{4,} \d{2,}\n", "", clean1)  # dotted table-of-contents entries
    clean1 = re.sub(r" \n\d+ \n | \n\d+ \n \n |\d+\. \w+ \w+", "", clean1)  # page numbers and numbered headings
    clean1 = re.sub(r"\.{4,} \d+|\.{4,} Error! Bookmark not defined\.", " ", clean1)  # leftover index lines
    clean1 = re.sub(r"\n{5,}|\n \n+", " ", clean1)  # long page jumps
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)  # bullet-point markers
    return clean1
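
# The reading, filtering and splitting steps chain naturally; a sketch with
# the same hypothetical path as above:
#
#     clean_text = filtering(reading_file("report.pdf"))
#     sentences = splitting("sentences", clean_text)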


def ctrlf(words: list, text):
    # return every sentence of `text` containing one of the given words,
    # like a Ctrl+F search over the whole document
    b = []
    for word in words:
        a = re.findall(fr"[^.]* {word} [^.]*\.", text)
        # case-insensitive, whole-word alternative:
        # a = re.findall(fr"(?i)\b{word}\b [^.]*\.", text)
        b.extend(a)
    return b
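
# Example: pulling out every sentence that mentions a given term
# (hypothetical keywords, reusing `clean_text` from the sketch above):
#
#     matches = ctrlf(["risk", "volatility"], clean_text)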


def everything_vs_word(query, corpus, model_name, number=5, score_function=util.cos_sim, ax=None):
    """
    -----------------------------------------------------------------------------

    This function takes as arguments the query we want to compare, the corpus
    to compare it against, the number of matches to show (5 by default), the
    model used, and the metric used to compute the similarity (cosine
    similarity by default).

    Returns: bar plot of the top matches

    -----------------------------------------------------------------------------

    Input:

    query: String
    corpus: String or list of strings (usually the latter for a document --> list of sentences)
    number: Int
    model_name: String
    score_function: Function
    ax: Axis object

    """

    # model retrieval
    model = SentenceTransformer(model_name)

    # embed the corpus and the query with the chosen model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # semantic search gives a list of lists composed of dictionaries
    hits = util.semantic_search(query_embedding, corpus_embedding, top_k=number, score_function=score_function)
    hits = hits[0]

    scoring = []
    corp = []
    for hit in hits:
        scoring.append(hit['score'])
        corp.append(corpus[hit['corpus_id']])

    # build a dataframe for easy plotting, best matches first
    data = pd.DataFrame(np.column_stack([corp, scoring]), columns=['Expression', 'Score'])
    data['Score'] = data['Score'].astype('float')
    data = data.sort_values(by=['Score'], ascending=False)

    return sns.barplot(data=data.reset_index(), ax=ax, x='Score', y='Expression')
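
# Example: plotting how closely the document's sentences match a query.
# The query is hypothetical; "all-MiniLM-L6-v2" is a public
# sentence-transformers checkpoint chosen here only for illustration:
#
#     everything_vs_word("climate change risk", sentences,
#                        model_name="all-MiniLM-L6-v2")
#     plt.show()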


def sim(query, corpus, model_name, number=5, score_function=util.cos_sim):
    # same semantic search as everything_vs_word, but returns the scored
    # dataframe instead of a plot

    # model retrieval
    model = SentenceTransformer(model_name)

    # embed the corpus and the query with the chosen model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # semantic search gives a list of lists composed of dictionaries
    hits = util.semantic_search(query_embedding, corpus_embedding, top_k=number, score_function=score_function)
    hits = hits[0]

    scoring = []
    corp = []
    for hit in hits:
        scoring.append(hit['score'])
        corp.append(corpus[hit['corpus_id']])

    # build a dataframe with the matches, best scores first
    data = pd.DataFrame(np.column_stack([corp, scoring]), columns=['Expression', 'Score'])
    data['Score'] = data['Score'].astype('float')
    data = data.sort_values(by=['Score'], ascending=False)
    return data
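
# Example (same hypothetical query and model as above), returning the scored
# matches as a dataframe instead of a plot:
#
#     scores = sim("climate change risk", sentences, model_name="all-MiniLM-L6-v2")
#     print(scores.head())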


def sim_2(query: list, corpus, model_name, threshold, number=5, score_function=util.cos_sim):
    # run sim() for every query and stack the per-query results
    frames = []
    for q in query:
        frames.append(sim(q, corpus, model_name=model_name, number=number, score_function=score_function))

    result = pd.concat(frames)
    result = result.sort_values(by=['Score'], ascending=False)
    # keep the best-scoring occurrence of each expression and drop matches
    # below the requested similarity threshold
    result.drop_duplicates(subset=['Expression'], inplace=True)
    result = result[result['Score'] >= threshold]
    return result
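
# Example: comparing several hypothetical queries at once and keeping only
# matches scoring at least 0.4:
#
#     best = sim_2(["liquidity risk", "credit risk"], sentences,
#                  model_name="all-MiniLM-L6-v2", threshold=0.4)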


############ EXTRA BALL ################
# Detect the conclusion and collect all the sentences of that paragraph
# for future use.
def conclusion():
    return


########## TODO: a function with the distribution of the results per word
|