Spaces:

DataRaptor
/

PatentMatch

Sleeping

App Files Files Community

DataRaptor committed on Jul 11, 2023

Commit

152844c

1 Parent(s): 86a60b5

Upload 5 files

Browse files

Files changed (5) hide show

app.py +161 -2
fold-0-train.csv +0 -0
infer.py +133 -0
model_weights.pth +3 -0
requirements.txt +6 -0

app.py CHANGED Viewed

@@ -1,8 +1,167 @@
 import streamlit as st
-st.markdown('Live demo will be available very soon.')
-st.markdown('Source code: https://github.com/dataraptor/PatentMatch/tree/main')

+import datetime
+import os
+import pathlib
+import requests
+import zipfile
+import pandas as pd
+import pydeck as pdk
+import geopandas as gpd
 import streamlit as st
+import leafmap.colormaps as cm
+from leafmap.common import hex_to_rgb
+import time
+from infer import USPPPMModel, USPPPMDataset
+import torch
+import pandas as pd
+@st.cache_resource
+def load_model():
+    """Build the inference model together with its input encoder.
+
+    The function is decorated with ``st.cache_resource``, Streamlit's cache
+    for shared resources.
+
+    Returns:
+        A ``(model, ds)`` tuple: a ``USPPPMModel`` on the
+        ``microsoft/deberta-v3-small`` backbone, in eval mode, with the
+        weights from ``model_weights.pth`` mapped onto the CPU; and a
+        ``USPPPMDataset`` built from the model's tokenizer with a maximum
+        length of 133.
+    """
+    net = USPPPMModel('microsoft/deberta-v3-small')
+    cpu = torch.device('cpu')
+    state_dict = torch.load('model_weights.pth', map_location=cpu)
+    net.load_state_dict(state_dict)
+    net.eval()
+    encoder = USPPPMDataset(net.tokenizer, 133)
+    return net, encoder
+def infer(anchor, target, title):
+    """Run the model on one anchor/target pair within a context title.
+
+    Args:
+        anchor: First phrase of the pair.
+        target: Phrase compared against ``anchor``.
+        title: Context title supplied alongside the pair.
+
+    Returns:
+        Element ``[0][0]`` of the model output after conversion to NumPy.
+    """
+    model, ds = load_model()
+    # The dataset reads a 'label' key from every record; there is no label
+    # at inference time, so 0 is passed as a placeholder.
+    sample = dict(anchor=anchor, target=target, title=title, label=0)
+    features, _ = ds[sample]
+    with torch.no_grad():
+        prediction = model(features)
+    return prediction.cpu().numpy()[0][0]
+@st.cache_data
+def get_context():
+    """Return the distinct context titles found in the training fold.
+
+    Reads ``./fold-0-train.csv`` and collects the unique values of its
+    ``title`` column.  The values are sorted so that consumers see the same
+    order on every process start (iterating a ``set`` of strings does not
+    guarantee that), and missing values are dropped so only real titles are
+    returned.
+
+    Returns:
+        list: Unique titles in ascending order.
+    """
+    df = pd.read_csv('./fold-0-train.csv')
+    return sorted(df['title'].dropna().unique())
+# Page-level Streamlit settings: browser title and icon, centered layout,
+# sidebar expanded on first load.
+st.set_page_config(
+    page_title="PatentMatch",
+    page_icon="🧊",
+    layout="centered",
+    initial_sidebar_state="expanded",
+)
+# fix sidebar
+# NOTE(review): the .css-* selectors below look like Streamlit-generated
+# class names, which may differ between Streamlit versions -- confirm they
+# still match the pinned release.
+st.markdown("""
+    <style>
+        .css-vk3wp9 {
+            background-color: rgb(255 255 255);
+            }
+        .css-18l0hbk {
+            padding: 0.34rem 1.2rem !important;
+            margin: 0.125rem 2rem;
+            }
+        .css-nziaof {
+            padding: 0.34rem 1.2rem !important;
+            margin: 0.125rem 2rem;
+            background-color: rgb(181 197 227 / 18%) !important;
+            }
+    </style>
+    """, unsafe_allow_html=True
+)
+# CSS that sets visibility to hidden on the #MainMenu, footer and header
+# elements; injected as raw HTML by the st.markdown call that follows.
+hide_st_style = """
+            <style>
+            #MainMenu {visibility: hidden;}
+            footer {visibility: hidden;}
+            header {visibility: hidden;}
+            </style>
+            """
+st.markdown(hide_st_style, unsafe_allow_html=True)
+def app():
+    """Render the PatentMatch page: intro text, inputs and prediction output.
+
+    Lays out a context dropdown and two phrase text inputs in a single row.
+    When the "Predict Scores" button is pressed, ``infer`` is called with the
+    current inputs and the formatted score is shown as a subheader.
+    """
+    st.title("PatentMatch: Patent Semantic Similarity Matcher")
+    st.markdown(
+        """This project is focused on developing a Transformer based NLP model to match phrases
+        in U.S. patents based on their semantic similarity within a specific
+        technical domain context. The trained model achieved Pearson correlation coefficient score of 0.745.
+        [[Source Code]](https://github.com/dataraptor/PatentMatch)
+    """
+    )
+    st.markdown('---')
+    row1_col1, row1_col2, row1_col3 = st.columns(
+        [0.5, 0.4, 0.4]
+    )
+    with row1_col1:
+        contexts = get_context()
+        # Preselect the preferred context when it is available; fall back to
+        # the first entry instead of raising ValueError if the data changes.
+        default_context = 'basic electric elements'
+        default_index = (
+            contexts.index(default_context) if default_context in contexts else 0
+        )
+        context = st.selectbox("Context", contexts, default_index)
+    with row1_col2:
+        anchor = st.text_input("Anchor", "deflect light")
+    with row1_col3:
+        target = st.text_input("Target", "bending moment")
+    if st.button("Predict Scores", type="primary"):
+        with st.spinner("Predicting scores..."):
+            score = infer(anchor, target, context)
+            st.success("Scores predicted successfully!")
+        # NOTE(review): the raw model output is shifted by a constant 2.0
+        # before display; the reason is not visible here -- confirm intent.
+        score += 2.0
+        st.subheader(f"Similarity Score: {score:<.3f}")
+# Render the main page content defined in app().
+app()
+# Display a footer with links and credits
+st.markdown("---")
+st.markdown("Built by [Shamim Ahamed](https://www.shamimahamed.com/). Data provided by [Kaggle](https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching)")
+#st.markdown("Data provided by [The Feedback Prize - ELLIPSE Corpus Scoring Challenge on Kaggle](https://www.kaggle.com/c/feedbackprize-ellipse-corpus-scoring-challenge)")

fold-0-train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

infer.py ADDED Viewed

	@@ -0,0 +1,133 @@

+from torch import nn
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+import torch
+from torch.utils.data import Dataset
+class MeanPooling(nn.Module):
+    """Mask-weighted mean of token embeddings along dimension 1."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, last_hidden_state, attention_mask):
+        """Average ``last_hidden_state`` over dim 1 using the mask as weights.
+
+        Args:
+            last_hidden_state: Embedding tensor; dimension 1 is reduced.
+            attention_mask: Mask that is unsqueezed on its last axis,
+                expanded to the shape of ``last_hidden_state`` and cast to
+                float before use.
+
+        Returns:
+            The masked sum divided by the mask total, where the divisor is
+            clamped to at least 1e-9 so an all-zero mask does not divide by
+            zero.
+        """
+        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+        masked_total = (last_hidden_state * weights).sum(1)
+        denominator = weights.sum(1).clamp(min=1e-9)
+        return masked_total / denominator
+class MeanPoolingLayer(nn.Module):
+    """Mean-pool encoder outputs, then project them with a linear layer.
+
+    Args:
+        input_size: Number of input features of the linear layer.
+        target_size: Number of output features of the linear layer.
+    """
+
+    def __init__(self, input_size, target_size):
+        super().__init__()
+        self.pool = MeanPooling()
+        self.fc = nn.Linear(input_size, target_size)
+
+    def forward(self, inputs, mask):
+        """Pool element 0 of ``inputs`` under ``mask`` and apply ``fc``.
+
+        Args:
+            inputs: Indexable encoder output; only ``inputs[0]`` is used.
+            mask: Attention mask forwarded to the pooling step.
+
+        Returns:
+            Output of the linear layer applied to the pooled features.
+        """
+        pooled = self.pool(inputs[0], mask)
+        return self.fc(pooled)
+def weight_init_normal(module, model):
+    """Re-initialise a single module in place; other module types are ignored.
+
+    Args:
+        module: Module to initialise.  ``nn.Linear`` and ``nn.Embedding``
+            weights are drawn from a normal distribution with mean 0 and
+            standard deviation ``model.config.initializer_range``; a Linear
+            bias (if any) and the Embedding padding row (if any) are zeroed.
+            ``nn.LayerNorm`` gets zero bias and unit weight.
+        model: Object exposing ``config.initializer_range``.
+    """
+    if isinstance(module, nn.Linear):
+        std = model.config.initializer_range
+        module.weight.data.normal_(mean=0.0, std=std)
+        if module.bias is not None:
+            module.bias.data.zero_()
+        return
+
+    if isinstance(module, nn.Embedding):
+        std = model.config.initializer_range
+        module.weight.data.normal_(mean=0.0, std=std)
+        pad = module.padding_idx
+        if pad is not None:
+            module.weight.data[pad].zero_()
+        return
+
+    if isinstance(module, nn.LayerNorm):
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
+class USPPPMModel(nn.Module):
+    """Pretrained transformer backbone with a mean-pooling regression head.
+
+    Args:
+        backbone: Model name or path handed to the ``from_pretrained``
+            constructors of ``AutoConfig``, ``AutoModel`` and
+            ``AutoTokenizer``.
+    """
+
+    def __init__(self, backbone):
+        super().__init__()
+        self.config = AutoConfig.from_pretrained(backbone, output_hidden_states=True)
+        self.model = AutoModel.from_pretrained(backbone, config=self.config)
+        # Size the head from the backbone's own config instead of a fixed
+        # 768 so that backbones with a different hidden width also work.
+        self.head = MeanPoolingLayer(self.config.hidden_size, 1)
+        self.tokenizer = AutoTokenizer.from_pretrained(backbone)
+
+    def _init_weights(self, layer):
+        """Apply ``weight_init_normal`` to ``layer`` and all its submodules."""
+        for module in layer.modules():
+            weight_init_normal(module, self)
+
+    def forward(self, inputs):
+        """Encode ``inputs`` with the backbone and run the pooling head.
+
+        Args:
+            inputs: Mapping of keyword arguments for the backbone; must
+                contain ``'attention_mask'``, which is also passed to the
+                head.
+
+        Returns:
+            Output of the head for the given inputs.
+        """
+        encoded = self.model(**inputs)
+        return self.head(encoded, inputs['attention_mask'])
+# Section letters and their names, written one "LETTER: Name" pair per line.
+table = """
+A: Human Necessities
+B: Operations and Transport
+C: Chemistry and Metallurgy
+D: Textiles
+E: Fixed Constructions
+F: Mechanical Engineering
+G: Physics
+H: Electricity
+Y: Emerging Cross-Sectional Technologies
+"""
+# Non-empty lines of the listing above.
+splits = [line for line in table.split('\n') if line]
+# Rebind ``table`` to a mapping from section letter to section name.
+table = dict(entry.split(': ', 1) for entry in splits)
+class USPPPMDataset(Dataset):
+    """Encode a single anchor/target/title record into model inputs.
+
+    Items are looked up by passing the record dict itself rather than an
+    integer index.
+
+    Args:
+        tokenizer: Callable tokenizer exposing ``sep_token``.
+        max_length: Length used for padding and truncation.
+    """
+
+    def __init__(self, tokenizer, max_length):
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def __len__(self):
+        # Nothing is stored; records are supplied by the caller on lookup.
+        return 0
+
+    def __getitem__(self, x):
+        """Tokenize one record.
+
+        Args:
+            x: Mapping with ``'anchor'``, ``'target'``, ``'title'`` and
+                ``'label'`` entries.
+
+        Returns:
+            ``(inputs, label)``: the tokenizer output with every value
+            converted to a long tensor with a leading dimension of size 1,
+            and the label as a float tensor.
+        """
+        score = x['label']
+        separator = self.tokenizer.sep_token
+        # Anchor, target and title are joined by the tokenizer's separator.
+        text = x['anchor'] + separator + x['target'] + separator + x['title']
+        encoded = self.tokenizer(
+            text,
+            add_special_tokens=True,
+            truncation=True,
+            padding='max_length',
+            max_length=self.max_length,
+            return_offsets_mapping=False,
+        )
+        for key, values in list(encoded.items()):
+            encoded[key] = torch.tensor(values, dtype=torch.long).unsqueeze(dim=0)
+        return encoded, torch.tensor(score, dtype=torch.float)
+if __name__ == '__main__':
+    # Smoke test: load the checkpoint on the CPU, score one hard-coded
+    # record and print the raw model output.
+    net = USPPPMModel('microsoft/deberta-v3-small')
+    cpu = torch.device('cpu')
+    net.load_state_dict(torch.load('model_weights.pth', map_location=cpu))
+    net.eval()
+    encoder = USPPPMDataset(net.tokenizer, 133)
+    sample = dict(
+        anchor='sprayed',
+        target='thermal sprayed coating',
+        title='building',
+        label=0,
+    )
+    features, _ = encoder[sample]
+    with torch.no_grad():
+        prediction = net(features)
+    print('y:', prediction)

model_weights.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b0b49ff053c7beac972a85d464305398ab93252901348418af1692e7ca0959dd
+size 565268017

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+streamlit==1.21.0
+Pillow
+protobuf
+torchvision==0.15.2
+torch==2.0.1
+numpy