# Fine-tune multilingual BERT (mBERT) for 16-class classification of abstracts.

import pandas as pd
import nltk
nltk.download("punkt")
from nltk import tokenize
from sklearn.model_selection import train_test_split
from transformers import BertConfig, BertTokenizer, TFBertModel
from tensorflow import convert_to_tensor
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall


# Path to an HDF file holding a dataframe with "Abstract" and "Label" columns.
DATA_PATH="..."

# Directory where the trained model and its evaluation stats are written.
SAVE_MODEL_TO=".../"


tab=pd.read_hdf(DATA_PATH)


def data_to_values(dataframe):
    """Extracts abstracts and labels
    from the dataframe as NumPy arrays.
    """
    abstracts=dataframe.Abstract.values
    labels=dataframe.Label.values
    return abstracts, labels


def tokenize_abstracts(abstracts):
    """Prepends '[CLS]' once to each abstract
    and appends '[SEP]' after every sentence.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts
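
# Quick sanity check (illustrative addition, not part of the original pipeline):
# a two-sentence abstract gets one leading '[CLS]' and a '[SEP]' after each sentence.
assert tokenize_abstracts(["First. Second."])[0] == "[CLS] First. [SEP] Second. [SEP] "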


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes texts into WordPiece tokens with the
    'bert-base-multilingual-uncased' tokenizer, truncating
    each sequence to at most max_len tokens.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their IDs
    in the BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    the tokens to their IDs in the BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids
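
# Quick sanity check (illustrative addition): every encoded abstract starts
# with the [CLS] ID and, before padding, ends with a [SEP] ID.
demo_ids=abstracts_to_ids(["Hello world."])[0]
assert demo_ids[0] == tokenizer.cls_token_id
assert demo_ids[-1] == tokenizer.sep_token_id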


def pad_ids(input_ids, max_len=512):
    """Pads the given ID sequences
    with zeros up to max_len.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids
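
# Quick illustration (not in the original script): shorter sequences are
# zero-padded on the right, longer ones truncated, to a fixed length.
assert pad_ids([[101, 102]], max_len=6).tolist() == [[101, 102, 0, 0, 0, 0]]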


def create_attention_masks(inputs):
    """Creates attention masks for the given sequences:
    1 for real tokens, 0 for padding (ID 0).
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks
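
# Quick illustration (not in the original script): padding positions (ID 0)
# are masked out, real token positions are kept.
assert create_attention_masks([[101, 2023, 102, 0, 0]]) == [[1.0, 1.0, 1.0, 0.0, 0.0]]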


def create_model():
    """Builds an mBERT encoder with a 16-way softmax head on the pooled output."""
    config=BertConfig.from_pretrained(
        "bert-base-multilingual-uncased",
        num_labels=16,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2)
    bert=TFBertModel.from_pretrained(
        "bert-base-multilingual-uncased",
        config=config)
    # The main BERT layer, carrying the pretrained weights.
    bert_layer=bert.layers[0]
    input_ids_layer=Input(
        shape=(512,),
        name="input_ids",
        dtype="int32")
    input_attention_masks_layer=Input(
        shape=(512,),
        name="attention_masks",
        dtype="int32")
    bert_model=bert_layer(
        input_ids_layer,
        attention_mask=input_attention_masks_layer)
    # bert_model[1] is the pooled [CLS] representation.
    target_layer=Dense(
        units=16,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name="target_layer",
        activation="softmax")(bert_model[1])
    model=Model(
        inputs=[input_ids_layer, input_attention_masks_layer],
        outputs=target_layer,
        name="mbert_multiclass_16")
    optimizer=Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0)
    model.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=[CategoricalAccuracy(), Precision(), Recall()])
    return model
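
# Shape walk-through (illustrative): for a batch of 8 abstracts the model maps
#   input_ids        (8, 512) int32
#   attention_masks  (8, 512) int32
# through the pooled [CLS] output (8, 768) to softmax scores over 16 classes (8, 16).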


abstracts, labels=data_to_values(tab)
ids=abstracts_to_ids(abstracts)
print("Abstracts tokenized, tokens converted to IDs.")


padded_ids=pad_ids(ids)
print("Sequences padded.")


# 70% train, 15% validation, 15% test.
train_inputs, temp_inputs, train_labels, temp_labels=train_test_split(padded_ids, labels, random_state=1993, test_size=0.3)
validation_inputs, test_inputs, validation_labels, test_labels=train_test_split(temp_inputs, temp_labels, random_state=1993, test_size=0.5)
print("Data split into train, validation, test sets.")
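
# NOTE (illustrative, not part of the original script): categorical_crossentropy
# with the 16-unit softmax head expects one-hot label vectors of shape (n, 16).
# If the Label column held integer class IDs instead, a conversion along these
# lines would be needed:
#
#     from tensorflow.keras.utils import to_categorical
#     train_labels=to_categorical(train_labels, num_classes=16)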


train_masks, validation_masks, test_masks=[create_attention_masks(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Attention masks created.")
train_inputs, validation_inputs, test_inputs=[convert_to_tensor(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Inputs converted to tensors.")
train_labels, validation_labels, test_labels=[convert_to_tensor(_) for _ in [train_labels, validation_labels, test_labels]]
print("Labels converted to tensors.")
train_masks, validation_masks, test_masks=[convert_to_tensor(_) for _ in [train_masks, validation_masks, test_masks]]
print("Masks converted to tensors.")


model=create_model()
print("Model initialized.")


history=model.fit([train_inputs, train_masks], train_labels,
                  batch_size=8,
                  epochs=2,
                  validation_data=([validation_inputs, validation_masks], validation_labels))
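
# Illustrative usage note (not in the original script): per-epoch metrics live
# in history.history, e.g. history.history["val_categorical_accuracy"].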


model.save(SAVE_MODEL_TO+"mbert_multiclass_16.h5")
print("Model saved.")


# evaluate returns [loss, categorical_accuracy, precision, recall],
# in the order the metrics were compiled.
test_score=model.evaluate([test_inputs, test_masks], test_labels,
                          batch_size=8)


print("Model tested.")


# Wrap the flat score list in another list to build a single-row dataframe;
# passing it bare would raise a shape error against the four column names.
stats=pd.DataFrame([test_score], columns=["loss", "accuracy", "precision", "recall"])
stats.to_excel(SAVE_MODEL_TO+"mbert_multiclass_16_stats.xlsx", index=False)


print("Stats saved.")