bigl34 committed on
Commit
8d3618c
·
1 Parent(s): bf389b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -0
app.py CHANGED
@@ -1,3 +1,118 @@
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
 
 
 
 
 
 
 
 
 
 
3
  gr.Interface.load("models/deepset/roberta-base-squad2").launch()
 
1
+
2
  import gradio as gr
3
+ import os
4
+ import numpy as np
5
import subprocess
import sys

# Install the runtime dependencies before the imports below need them.
# `sys.executable -m pip` guarantees the packages land in the interpreter
# that is running this script, and the argument list avoids going through a
# shell. A failed install is reported but not fatal, so an environment where
# the packages are already present keeps working (as with the old os.system).
_pip_command = [
    sys.executable, "-m", "pip", "install",
    "pdfminer.six", "rank_bm25", "torch", "transformers",
]
_pip_status = subprocess.run(_pip_command).returncode
if _pip_status != 0:
    print(
        "warning: dependency installation exited with status {}".format(_pip_status),
        file=sys.stderr,
    )
6
+
7
+ from gradio.mix import Series
8
+ #import re
9
+ from rank_bm25 import BM25Okapi
10
+ import string
11
+ import torch
12
+ from transformers import pipeline
13
+ import pdfminer
14
+ from pdfminer.high_level import extract_text
15
+
16
# Chunking: read_pdf slices the extracted text into pieces of `len_doc`
# characters and starts a new piece every `len_doc - overlap` characters.
len_doc, overlap = 500, 15

# Ranking depth: how many BM25 candidates are handed to the QA model, and how
# many of the QA model's answers are kept afterwards.
param_top_k_retriver, param_top_k_ranker = 15, 3
20
+
21
def read_pdf(file):
    """Extract the text of a PDF and split it into overlapping chunks.

    Args:
        file: Object whose ``name`` attribute is the path of the PDF to read
            (presumably the temp-file wrapper Gradio passes for a File
            input -- TODO confirm).

    Returns:
        list[str]: Consecutive slices of the extracted text, each at most
        ``len_doc`` characters long. A new slice starts every
        ``len_doc - overlap`` characters, so neighbouring slices share
        ``overlap`` characters. Empty when no text was extracted.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``len_doc``; the
            chunk start could then never advance.
    """
    step = len_doc - overlap
    if step <= 0:
        # The previous while-loop would spin forever in this configuration.
        raise ValueError(
            "overlap ({}) must be smaller than len_doc ({})".format(overlap, len_doc)
        )

    text = extract_text(file.name)
    # Split text into smaller docs
    return [text[start:start + len_doc] for start in range(0, len(text), step)]
31
+
32
+ # We use BM25 as the retriever, which does the first round of candidate filtering based on word-level matching
33
+
34
# Words ignored by bm25_tokenizer. Built once at import time and stored as a
# frozenset so each membership test is O(1) instead of a list scan.
_BM25_STOP_WORDS = frozenset(
    ['a', 'the', 'am', 'is', 'are', 'who', 'how', 'where', 'when', 'why', 'what']
)


def bm25_tokenizer(text):
    """Turn *text* into the list of tokens used for BM25 matching.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every token. Tokens that end up empty or
    that are stop words are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    stripped = (token.strip(string.punctuation) for token in text.lower().split())
    return [token for token in stripped if token and token not in _BM25_STOP_WORDS]
43
+
44
def retrieval(query, top_k_retriver, docs, bm25_):
    """Return the best BM25 matches for *query*, highest score first.

    Args:
        query: The question text; tokenized with ``bm25_tokenizer``.
        top_k_retriver: Maximum number of candidates to return.
        docs: The text chunks, indexed the same way as the scores returned
            by ``bm25_``.
        bm25_: Object exposing ``get_scores(tokens)``; presumably the
            BM25Okapi index built over ``docs`` -- verify against caller.

    Returns:
        list[dict]: One dict per hit with keys ``'corpus_id'`` (index into
        ``docs``), ``'score'`` and ``'docs'`` (the chunk text). Only chunks
        with a score above zero are included.
    """
    bm25_scores = bm25_.get_scores(bm25_tokenizer(query))
    # argsort is ascending; reversing it yields indices by descending score,
    # so the hits below are already ordered and need no second sort.
    top_n = np.argsort(bm25_scores)[::-1][:top_k_retriver]
    return [
        {'corpus_id': idx, 'score': bm25_scores[idx], 'docs': docs[idx]}
        for idx in top_n
        if bm25_scores[idx] > 0
    ]
54
+
55
def qa_ranker(query, docs_, top_k_ranker, qa_model):
    """Run the QA model on every candidate chunk and keep the best answers.

    Args:
        query: The question to answer.
        docs_: Iterable of context strings to search for an answer.
        top_k_ranker: Maximum number of answers to return.
        qa_model: Callable invoked as ``qa_model(question=..., context=...)``
            that returns a mapping containing at least a ``'score'`` entry.

    Returns:
        list[dict]: Up to ``top_k_ranker`` answers sorted by descending
        ``'score'``. Each is a copy of the model's result with an added
        ``'doc'`` key holding the context it came from.
    """
    # dict(answer, doc=...) copies the result instead of writing into the
    # object the model handed back, which the model may share or cache.
    answers = [
        dict(qa_model(question=query, context=doc), doc=doc)
        for doc in docs_
    ]
    answers.sort(key=lambda answer: answer['score'], reverse=True)
    return answers[:top_k_ranker]
63
+
64
def _colored_markup(inner, color):
    """Wrap *inner* in a ``<text>`` element whose style sets *color*."""
    return "<text style=color:{}>{}</text>".format(color, inner)


def cstr(s, color='black'):
    """Return *s* wrapped in markup that colours it with *color*."""
    return _colored_markup(s, color)


def cstr_bold(s, color='black'):
    """Return *s* as bold text wrapped in markup that colours it with *color*."""
    return _colored_markup("<b>{}</b>".format(s), color)


def cstr_break(s, color='black'):
    """Return *s* preceded by a line break, wrapped in markup coloured *color*."""
    return _colored_markup("<br>{}".format(s), color)
70
+
71
def print_colored(text, start_idx, end_idx, confidence):
    """Render *text* as HTML with the answer span highlighted.

    Args:
        text: The context passage the answer was found in.
        start_idx: Index in *text* where the answer starts.
        end_idx: Index in *text* just past the end of the answer.
        confidence: Confidence already formatted as a string; it is appended
            after the label ``'- Confidence: '``.

    Returns:
        str: Black markup containing the text before the answer, the answer
        in bold blue, the text after it, and the confidence in grey on a new
        line, joined by single spaces.
    """
    import html

    # The passage comes from an uploaded document, so '<', '>' and '&' must
    # be escaped or they would be interpreted as markup in the HTML output.
    before = html.escape(text[:start_idx], quote=False)
    answer = html.escape(text[start_idx:end_idx], quote=False)
    after = html.escape(text[end_idx:], quote=False)
    conf_str = html.escape('- Confidence: ' + confidence, quote=False)

    pieces = [
        before,
        cstr_bold(answer, color='blue'),
        after,
        cstr_break(conf_str, color='grey'),
    ]
    return cstr(' '.join(pieces), color='black')
78
+
79
# Question-answering pipelines that have already been loaded, keyed by model
# id, so a model is built at most once per process instead of once per query.
_QA_MODEL_CACHE = {}


def _get_qa_model(model_nm):
    """Return the question-answering pipeline for ``deepset/<model_nm>``."""
    model_id = "deepset/" + str(model_nm)
    if model_id not in _QA_MODEL_CACHE:
        _QA_MODEL_CACHE[model_id] = pipeline("question-answering", model=model_id)
    return _QA_MODEL_CACHE[model_id]


def _render_answer(answer):
    """Format one ranked answer as highlighted HTML with its confidence in %."""
    confidence = str(np.round(100 * answer["score"], 1)) + "%"
    return print_colored(answer['doc'], answer['start'], answer['end'], confidence)


def final_qa_pipeline(file, query, model_nm):
    """Answer *query* from the uploaded PDF and return the two best answers.

    The PDF is split into chunks, BM25 selects up to ``param_top_k_retriver``
    candidate chunks, and the QA model then ranks those candidates.

    Args:
        file: The uploaded PDF, as accepted by ``read_pdf``.
        query: The question text.
        model_nm: Model name; ``"deepset/"`` is prefixed to form the model id.

    Returns:
        tuple[str, str]: HTML for the best and second-best answer. The second
        entry is the string ``"None"`` when only one answer is available, and
        both entries are ``"No match"`` when nothing could be retrieved.
    """
    no_match = ("No match", "No match")

    docs = read_pdf(file)
    if not docs:
        # No extractable text: an empty corpus cannot be indexed (BM25Okapi
        # is expected to fail on it -- see the rank_bm25 documentation).
        return no_match

    bm25 = BM25Okapi([bm25_tokenizer(doc) for doc in docs])
    lvl1 = retrieval(query, param_top_k_retriver, docs, bm25)
    if not lvl1:
        return no_match

    # Load the model only after retrieval found candidates to rank.
    qa_model = _get_qa_model(model_nm)
    fnl_rank = qa_ranker(query, [hit["docs"] for hit in lvl1], param_top_k_ranker, qa_model)
    if not fnl_rank:
        return no_match

    top1 = _render_answer(fnl_rank[0])
    # Check the ranked list itself: it can be shorter than the candidate list.
    top2 = _render_answer(fnl_rank[1]) if len(fnl_rank) > 1 else "None"
    return (top1, top2)
104
+
105
# No bundled example inputs are provided.
examples = [
]

# Web UI: a PDF upload, a question box and a model selector feed
# final_qa_pipeline; its two results are shown as HTML.
iface = gr.Interface(
    fn = final_qa_pipeline,
    inputs = [gr.inputs.File(label="input pdf file"), gr.inputs.Textbox(label="Question:"), gr.inputs.Dropdown(choices=["minilm-uncased-squad2","roberta-base-squad2"],label="Model")],
    outputs = [gr.outputs.HTML(label="Top 1 answer"), gr.outputs.HTML(label="Top 2 answer")],
    examples=examples,
    theme = "grass",
    title = "Question Answering on annual reports",
    description = "Navigate long annual reports by using Machine learning to answer your questions. \nSimply upload any annual report pdf you are interested in and ask model a question OR load an example from below."
)
# The previous version of this file ended with
# gr.Interface.load("models/deepset/roberta-base-squad2").launch(); that stale
# second launch is dropped so only this interface is served.
iface.launch(enable_queue = True)