Spaces:
Sleeping
Sleeping
Commit ·
152844c
1
Parent(s): 86a60b5
Upload 5 files
Browse files- app.py +161 -2
- fold-0-train.csv +0 -0
- infer.py +133 -0
- model_weights.pth +3 -0
- requirements.txt +6 -0
app.py
CHANGED
|
@@ -1,8 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
|
| 7 |
-
st.markdown('Source code: https://github.com/dataraptor/PatentMatch/tree/main')
|
| 8 |
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import os
|
| 3 |
+
import pathlib
|
| 4 |
+
import requests
|
| 5 |
+
import zipfile
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import pydeck as pdk
|
| 8 |
+
import geopandas as gpd
|
| 9 |
import streamlit as st
|
| 10 |
+
import leafmap.colormaps as cm
|
| 11 |
+
from leafmap.common import hex_to_rgb
|
| 12 |
+
import time
|
| 13 |
+
from infer import USPPPMModel, USPPPMDataset
|
| 14 |
+
import torch
|
| 15 |
+
import pandas as pd
|
| 16 |
|
| 17 |
+
@st.cache_resource
def load_model():
    """Build the similarity model and its dataset helper.

    The function is wrapped in ``st.cache_resource`` so the backbone and the
    weights file are not re-read every time it is called.

    Returns:
        tuple: ``(model, ds)`` where ``model`` is a ``USPPPMModel`` in eval
        mode with the weights from ``model_weights.pth`` loaded on CPU, and
        ``ds`` is a ``USPPPMDataset`` built around the model's tokenizer with
        a maximum length of 133.
    """
    model = USPPPMModel('microsoft/deberta-v3-small')
    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint file cannot execute arbitrary code on load.
    state_dict = torch.load(
        'model_weights.pth',
        map_location=torch.device('cpu'),
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()
    ds = USPPPMDataset(model.tokenizer, 133)
    return model, ds
|
| 24 |
+
|
| 25 |
+
def infer(anchor, target, title):
    """Score one anchor/target phrase pair within the given context title.

    Args:
        anchor: First phrase.
        target: Second phrase.
        title: Context title the phrases are compared in.

    Returns:
        The element at ``[0][0]`` of the model output, converted to NumPy.
    """
    model, ds = load_model()

    # The dataset helper requires a 'label' key; 0 is only a placeholder here.
    sample = dict(anchor=anchor, target=target, title=title, label=0)
    encoded, _ = ds[sample]

    with torch.no_grad():
        prediction = model(encoded)

    return prediction.cpu().numpy()[0][0]
|
| 39 |
+
|
| 40 |
+
@st.cache_data
def get_context():
    """Return the distinct context titles from the training CSV.

    Returns:
        list: Unique non-null values of the ``title`` column of
        ``./fold-0-train.csv``, sorted so the select box shows the same
        order on every run (a plain ``set`` has no stable order).
    """
    df = pd.read_csv('./fold-0-train.csv')
    return sorted(df['title'].dropna().unique())
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Page-level Streamlit settings: browser tab title/icon and overall layout.
st.set_page_config(**{
    "page_title": "PatentMatch",
    "page_icon": "🧊",
    "layout": "centered",
    "initial_sidebar_state": "expanded",
})
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# Sidebar appearance overrides.
# NOTE(review): the .css-* selectors look like generated class names and may
# stop matching after a Streamlit upgrade -- confirm against the pinned version.
sidebar_css = """
<style>
.css-vk3wp9 {
background-color: rgb(255 255 255);
}
.css-18l0hbk {
padding: 0.34rem 1.2rem !important;
margin: 0.125rem 2rem;
}
.css-nziaof {
padding: 0.34rem 1.2rem !important;
margin: 0.125rem 2rem;
background-color: rgb(181 197 227 / 18%) !important;
}
</style>
"""

# Hides the main menu, footer and header elements of the page.
hide_st_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
header {visibility: hidden;}
</style>
"""

# Inject both style sheets, sidebar overrides first.
for css_snippet in (sidebar_css, hide_st_style):
    st.markdown(css_snippet, unsafe_allow_html=True)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def app():
    """Render the PatentMatch page.

    Shows the title and project description, three inputs (context select
    box, anchor text, target text) and a button that runs ``infer`` and
    displays the resulting similarity score.
    """
    st.title("PatentMatch: Patent Semantic Similarity Matcher")

    st.markdown(
        """This project is focused on developing a Transformer based NLP model to match phrases
in U.S. patents based on their semantic similarity within a specific
technical domain context. The trained model achieved Pearson correlation coefficient score of 0.745.
[[Source Code]](https://github.com/dataraptor/PatentMatch)
"""
    )

    st.markdown('---')

    row1_col1, row1_col2, row1_col3 = st.columns([0.5, 0.4, 0.4])

    with row1_col1:
        contexts = get_context()
        # Pre-select the preferred context when the CSV contains it; fall back
        # to the first entry instead of raising ValueError when it does not.
        preferred = 'basic electric elements'
        default_index = contexts.index(preferred) if preferred in contexts else 0
        context = st.selectbox("Context", contexts, default_index)

    with row1_col2:
        anchor = st.text_input("Anchor", "deflect light")
    with row1_col3:
        target = st.text_input("Target", "bending moment")

    if st.button("Predict Scores", type="primary"):
        with st.spinner("Predicting scores..."):
            score = infer(anchor, target, context)
        st.success("Scores predicted successfully!")

        # NOTE(review): the raw model output is offset by 2.0 before display;
        # the reason is not visible in this file -- confirm against the label
        # scaling used during training.
        score += 2.0
        fmt = "{:<.3f}".format(score)
        st.subheader(f"Similarity Score: {fmt}")
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
app()

# Page footer: a divider followed by author credit and data attribution.
footer_text = (
    "Built by [Shamim Ahamed](https://www.shamimahamed.com/). "
    "Data provided by [Kaggle](https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching)"
)
st.markdown("---")
st.markdown(footer_text)
|
| 163 |
+
#st.markdown("Data provided by [The Feedback Prize - ELLIPSE Corpus Scoring Challenge on Kaggle](https://www.kaggle.com/c/feedbackprize-ellipse-corpus-scoring-challenge)")
|
| 164 |
|
| 165 |
|
| 166 |
|
|
|
|
| 167 |
|
fold-0-train.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
infer.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch import nn
|
| 2 |
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
| 3 |
+
import torch
|
| 4 |
+
from torch.utils.data import Dataset
|
| 5 |
+
|
| 6 |
+
class MeanPooling(nn.Module):
    """Average hidden states along dim 1, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` over dim 1.

        The mask gets a trailing axis and is expanded to the hidden-state
        size, so masked positions contribute zero to the sum. The divisor is
        clamped at 1e-9 so an all-zero mask does not divide by zero.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(dim=1)
        kept_count = weights.sum(dim=1).clamp(min=1e-9)
        return masked_total / kept_count
|
| 17 |
+
|
| 18 |
+
class MeanPoolingLayer(nn.Module):
    """Mean-pool the first backbone output and project it with a linear layer."""

    def __init__(self, input_size, target_size):
        """Create the pooling step and an ``input_size -> target_size`` linear layer."""
        super().__init__()
        self.pool = MeanPooling()
        self.fc = nn.Linear(input_size, target_size)

    def forward(self, inputs, mask):
        """Pool ``inputs[0]`` under ``mask`` and return the linear projection."""
        pooled = self.pool(inputs[0], mask)
        return self.fc(pooled)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def weight_init_normal(module, model):
    """Re-initialise ``module`` in place according to its layer type.

    Args:
        module: Layer to initialise. ``nn.Linear`` and ``nn.Embedding``
            weights are drawn from a normal distribution with mean 0 and
            standard deviation ``model.config.initializer_range``; linear
            biases and the embedding padding row are zeroed.
            ``nn.LayerNorm`` gets bias 0 and weight 1. Any other module type
            is left untouched.
        model: Object exposing ``config.initializer_range``.
    """
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
        # LayerNorm built with elementwise_affine=False has no weight/bias;
        # skip the missing parameters instead of raising AttributeError.
        if module.bias is not None:
            module.bias.data.zero_()
        if module.weight is not None:
            module.weight.data.fill_(1.0)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class USPPPMModel(nn.Module):
    """Pretrained transformer backbone followed by a mean-pooling regression head."""

    def __init__(self, backbone):
        """Load config, backbone and tokenizer for ``backbone`` and build the head.

        Args:
            backbone: Model name or path passed to the ``from_pretrained``
                loaders of ``AutoConfig``, ``AutoModel`` and ``AutoTokenizer``.
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(backbone, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(backbone, config=self.config)
        # Size the head from the backbone config rather than a hard-coded 768
        # so that backbones with a different hidden width also work.
        self.head = MeanPoolingLayer(self.config.hidden_size, 1)
        self.tokenizer = AutoTokenizer.from_pretrained(backbone)

    def _init_weights(self, layer):
        """Apply ``weight_init_normal`` to every sub-module of ``layer``."""
        for module in layer.modules():
            weight_init_normal(module, self)

    def forward(self, inputs):
        """Run the backbone on ``inputs`` and pool its output with the head.

        Args:
            inputs: Mapping of keyword arguments for the backbone; it must
                contain ``'attention_mask'``, which is also given to the head.

        Returns:
            The output of the pooling head.
        """
        outputs = self.model(**inputs)
        outputs = self.head(outputs, inputs['attention_mask'])
        return outputs
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# Section letter -> section title, parsed from the "LETTER: Title" listing below.
table = """
A: Human Necessities
B: Operations and Transport
C: Chemistry and Metallurgy
D: Textiles
E: Fixed Constructions
F: Mechanical Engineering
G: Physics
H: Electricity
Y: Emerging Cross-Sectional Technologies
"""
# Keep only the non-empty rows of the listing.
splits = list(filter(None, table.splitlines()))
# Each row splits into exactly (letter, title) on the ': ' delimiter.
table = dict(row.split(': ') for row in splits)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class USPPPMDataset(Dataset):
    """Turns a single sample dict into tokenized model inputs.

    The object holds no rows of its own: ``__len__`` is always 0 and
    ``__getitem__`` takes the sample dict itself instead of an index.
    """

    def __init__(self, tokenizer, max_length):
        """Remember the tokenizer and the fixed sequence length to pad/truncate to."""
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return 0

    def __getitem__(self, x):
        """Encode one sample.

        Args:
            x: Mapping with the keys ``'anchor'``, ``'target'``, ``'title'``
                and ``'label'``.

        Returns:
            tuple: ``(inputs, label)``. ``inputs`` is the tokenizer output
            with each value converted to a long tensor carrying a leading
            axis of size 1; ``label`` is ``x['label']`` as a float tensor.
        """
        score = x['label']

        # anchor, target and title are joined with the tokenizer's separator token.
        separator = self.tokenizer.sep_token
        text = separator.join((x['anchor'], x['target'], x['title']))

        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_offsets_mapping=False,
        )
        # Convert every field in place, adding the leading axis of size 1.
        for key in list(encoded.keys()):
            encoded[key] = torch.tensor(encoded[key], dtype=torch.long).unsqueeze(dim=0)

        return encoded, torch.tensor(score, dtype=torch.float)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
if __name__ == '__main__':
    # Smoke test: load the weights on CPU, score one hard-coded sample and
    # print the raw model output.
    model = USPPPMModel('microsoft/deberta-v3-small')
    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint file cannot execute arbitrary code on load.
    state_dict = torch.load(
        'model_weights.pth',
        map_location=torch.device('cpu'),
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    ds = USPPPMDataset(model.tokenizer, 133)

    d = {
        'anchor': 'sprayed',
        'target': 'thermal sprayed coating',
        'title': 'building',
        'label': 0,  # placeholder; the dataset helper requires the key
    }
    inp = ds[d]
    x = inp[0]

    with torch.no_grad():
        y = model(x)
    print('y:', y)
|
| 132 |
+
|
| 133 |
+
|
model_weights.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0b49ff053c7beac972a85d464305398ab93252901348418af1692e7ca0959dd
|
| 3 |
+
size 565268017
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.21.0
Pillow
protobuf
torchvision==0.15.2
torch==2.0.1
numpy
# Imported by infer.py / app.py but previously undeclared.
pandas
transformers
# Needed to build the tokenizer for microsoft/deberta-v3-small.
sentencepiece
# Imported at the top of app.py; the app fails at import time without them.
geopandas
leafmap
|