| |
|
|
| import pandas as pd |
| import glob |
| from nltk import tokenize |
| from transformers import BertTokenizer, TFBertModel, BertConfig |
| from transformers.utils.dummy_tf_objects import TFBertMainLayer |
| from tensorflow.keras.preprocessing.sequence import pad_sequences |
| from tensorflow import convert_to_tensor |
| from tensorflow.keras.layers import Input, Dense |
| from tensorflow.keras.initializers import TruncatedNormal |
| from tensorflow.keras.models import load_model, Model |
| from tensorflow.keras.optimizers import Adam |
| from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall |
|
|
|
|
| |
|
|
# Input corpus: expected to be an indexable collection of abstract strings
# (the batching code slices it with DATA[a:b]). Placeholder — fill in before running.
DATA="..."

# Directory holding the trained .h5 Keras models; must end with a path separator
# because models_predict globs f"{MODELS}*.h5". Placeholder — fill in.
MODELS=".../"

# Output directory for the predictions spreadsheet. Placeholder — fill in.
SAVE_PREDICTIONS_TO="..."
|
|
|
|
| |
|
|
def tokenize_abstracts(abstracts):
    """Wrap each abstract's sentences with BERT special tokens.

    Each abstract is prefixed with a single '[CLS]' marker, and every
    sentence found by the NLTK sentence tokenizer is followed by ' [SEP] '.
    Returns one marked-up string per input abstract.
    """
    return [
        "[CLS] " + "".join(sentence + " [SEP] "
                           for sentence in tokenize.sent_tokenize(abstract))
        for abstract in abstracts
    ]
|
|
|
|
# Shared multilingual BERT wordpiece tokenizer, used by the helpers below.
tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
|
|
|
|
def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Run the BERT wordpiece tokenizer over each prepared abstract.

    Every resulting token sequence is truncated to at most `max_len`
    tokens (BERT's maximum input length).
    """
    wordpiece_sequences = []
    for text in t_abstracts:
        tokens = tokenizer.tokenize(text)
        wordpiece_sequences.append(tokens[:max_len])
    return wordpiece_sequences
|
|
|
|
def convert_to_ids(b_t_abstracts):
    """Map each abstract's wordpiece tokens to their BERT vocabulary IDs."""
    id_sequences = []
    for tokens in b_t_abstracts:
        id_sequences.append(tokenizer.convert_tokens_to_ids(tokens))
    return id_sequences
|
|
|
|
def abstracts_to_ids(abstracts):
    """Full preprocessing pipeline: raw abstracts -> BERT vocabulary IDs.

    Chains sentence marking ([CLS]/[SEP]), wordpiece tokenization and
    token-to-ID conversion.
    """
    with_special_tokens = tokenize_abstracts(abstracts)
    wordpieces = b_tokenize_abstracts(with_special_tokens)
    return convert_to_ids(wordpieces)
|
|
|
|
def pad_ids(input_ids, max_len=512):
    """Pad (or truncate) every ID sequence to exactly `max_len` entries.

    Padding and truncation both happen at the end of each sequence
    ("post"); the result has dtype "long", with 0 as the pad value.
    """
    return pad_sequences(
        input_ids,
        maxlen=max_len,
        dtype="long",
        truncating="post",
        padding="post",
    )
|
|
|
|
def create_attention_masks(inputs):
    """Build attention masks for padded ID sequences.

    Positions holding a real token (ID > 0) get 1.0; padding positions
    (ID 0) get 0.0. Returns one mask list per input sequence.
    """
    return [[float(token_id > 0) for token_id in sequence] for sequence in inputs]
|
|
|
|
| |
|
|
def float_to_percent(value, decimal=3):
    """Convert a probability in [0, 1) to a percentage string.

    The parameter was renamed from `float` because it shadowed the
    builtin. The decimals are truncated (not rounded): the string form of
    value*100 is cut to `decimal` digits after the point and '%' appended.
    """
    # [:decimal + 3] keeps up to two integer digits, the dot and
    # `decimal` fractional digits, matching the original truncation.
    return str(value * 100)[:(decimal + 3)] + "%"
|
|
|
|
def models_predict(directory, inputs, attention_masks, float_to_percent=False):
    """Load every .h5 model in `directory` and predict on the given batch.

    Args:
        directory: path (expected to end with a separator) globbed for "*.h5".
        inputs: tensor of token IDs.
        attention_masks: tensor of attention masks matching `inputs`.
        float_to_percent: if True, format each probability as a truncated
            percentage string (e.g. "12.345%") instead of a float.

    Returns:
        {model name: [probability of text N dealing with the target, ...], ...}

    Fixes two bugs in the original: the boolean flag shadowed the
    module-level float_to_percent() helper (calling it raised TypeError),
    and the loop variable was clobbered by an inner comprehension so the
    later log lines printed a prediction value instead of the model path.
    """
    model_paths = glob.glob(f"{directory}*.h5")
    predictions_dict = {}
    for model_path in model_paths:
        model = load_model(model_path)
        print(f"Model {model_path} is loaded.")
        # NOTE(review): predict_step bypasses Model.predict's batching and
        # callbacks; presumably intentional for one in-memory batch — confirm.
        raw_predictions = model.predict_step([inputs, attention_masks])
        print(f"Predictions from the model {model_path} are finished.")
        scores = [float(p) for p in raw_predictions]
        if float_to_percent:
            # The flag shadows the module-level float_to_percent() function in
            # this scope, so replicate its formatting inline (truncate to the
            # default 3 decimals and append '%').
            scores = [str(score * 100)[:6] + "%" for score in scores]
        predictions_dict[model.name] = scores
        print(f"Predictions from the model {model_path} are saved.")
        del scores, model
    return predictions_dict
|
|
|
|
def predictions_dict_to_df(predictions_dictionary):
    """Turn {model name: [probabilities, ...]} into a dataframe.

    Column names are cleaned up (a leading "model_" is dropped and the
    remaining underscores become dots), and a leading "text" column
    numbers the rows 0..n-1.
    """
    frame = pd.DataFrame(predictions_dictionary)
    frame.columns = [name.replace("model_", "").replace("_", ".")
                     for name in frame.columns]
    frame.insert(0, column="text", value=list(range(len(frame))))
    return frame
|
|
|
|
def predictions_above_treshold(predictions_dataframe, treshold=0.95):
    """Collect, per text, the targets predicted strictly above `treshold`.

    Input is expected to be a dataframe whose first column is "text" and
    whose remaining columns are per-target probabilities. Output:
    {row number: [column labels with probability > treshold], ...}
    """
    # Skip the leading "text" column; keep only probability columns.
    per_row = predictions_dataframe.iloc[:, 1:].apply(
        lambda row: row.index[row > treshold], axis=1)
    return {i: list(per_row[i]) for i in range(len(per_row))}
|
|
|
|
| |
|
|
# Score the corpus in fixed-size batches to keep memory bounded.
BATCH_SIZE = 100


def _predict_batch(abstracts):
    """Preprocess one batch of abstracts and return its predictions dataframe."""
    ids = abstracts_to_ids(abstracts)
    padded_ids = pad_ids(ids)
    masks = convert_to_tensor(create_attention_masks(padded_ids))
    inputs = convert_to_tensor(padded_ids)
    predictions = models_predict(MODELS, inputs, masks)
    return predictions_dict_to_df(predictions)


# range() handles both the full batches and the final partial batch, and it
# also fixes the original IndexError when len(DATA) < 100 (empty `marks`
# list followed by marks[-1]).
batch_frames = []
for start in range(0, len(DATA), BATCH_SIZE):
    batch_frames.append(_predict_batch(DATA[start:start + BATCH_SIZE]))

# DataFrame.append was removed in pandas 2.x; concatenate all batches at once.
# The row index is cosmetic here (to_excel drops it), so ignore_index is safe.
if batch_frames:
    output = pd.concat(batch_frames, ignore_index=True)
else:
    output = pd.DataFrame()

# Original wrote to the literal path "SAVE_PREDICTIONS_TO/predictions.xlsx";
# use the configured output directory instead.
output.to_excel(f"{SAVE_PREDICTIONS_TO}/predictions.xlsx", index=False)
|
|