sritang committed on
Commit
9aa38c0
·
1 Parent(s): db632ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -11
app.py CHANGED
@@ -1,16 +1,25 @@
 
 
 
 
 
1
  from datasets import load_dataset
2
- import logging
3
- import time
4
- from pathlib import Path
5
- import contextlib
6
 
7
- _here = Path(__file__).parent
8
- pdf_obj = _here / "H561907.pdf"
9
 
 
 
10
 
11
- import gradio as gr
12
- import pdfminer
13
- from pdfminer.high_level import extract_text
 
 
 
 
14
 
15
# Startup diagnostic: log the resolved PDF path so deployment logs show
# which document the app is serving. (Removed the stray `print('d')`
# debug leftover — it carried no information.)
print(pdf_obj)
 
 
 
 
1
# Third-party dependencies: gradio for the UI, sentence-transformers for
# semantic retrieval, torch for tensor math, HF datasets for the corpus.
# (Removed commented-out `numpy`/`random` imports — dead code.)
import gradio as gr
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from torch import tensor as torch_tensor
from datasets import load_dataset

# --- Models ------------------------------------------------------------------

# Bi-encoder embeds queries and passages into the same vector space for
# fast first-stage retrieval.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256  # Truncate long passages to 256 tokens

# The bi-encoder will retrieve top_k documents. We use a cross-encoder
# to re-rank the results list to improve the quality.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# --- Datasets ----------------------------------------------------------------

# Passage corpus: plain text under the 'psg' column, one passage per row.
dataset = load_dataset("gfhayworth/hack_policy", split='train')
mypassages = list(dataset.to_pandas()['psg'])

# Precomputed passage embeddings loaded into a torch tensor.
# NOTE(review): assumes embedding rows are aligned one-to-one with
# mypassages order — confirm against the dataset builder.
# (Removed the bare `dataset_embed_pd` and `type(dataset_embed_pd)`
# expressions — notebook leftovers that are no-ops in a script.)
dataset_embed = load_dataset("gfhayworth/hack_policy_embed", split='train')
dataset_embed_pd = dataset_embed.to_pandas()
mycorpus_embeddings = torch_tensor(dataset_embed_pd.values)