| import pandas as pd |
| from gensim.similarities import SparseMatrixSimilarity |
| import argparse |
| import logging |
| import time |
|
|
| from utils.utilities import read_yaml_config, validate_and_create_subfolders |
| from utils.mlutilities import * |
|
|
| import logging |
| import sys |
|
|
| logging.basicConfig( |
| level=logging.INFO, |
| format="%(asctime)s [%(levelname)s] %(message)s", |
| handlers=[ |
| logging.FileHandler("debug.log"), |
| logging.StreamHandler(sys.stdout) |
| ] |
| ) |
|
|
|
|
| model_configurations = read_yaml_config("/Users/luis.morales/Desktop/arxiv-paper-recommender/src/models/configs.yaml") |
|
|
|
|
| if __name__ == "__main__": |
| """ |
| Example: |
| python3 ./src/models/train_recommender.py --modelsize Medium |
| |
| """ |
| |
| parser = argparse.ArgumentParser(description='ArXiv Paper Recommender CLI') |
| parser.add_argument('--modelsize',choices=["Large", "SubLarge", "Medium", "Small"], default=None, type=str, help='Model Size') |
|
|
| args = parser.parse_args() |
| model_size = args.modelsize |
| start = time.time() |
| |
| |
| if model_size is None: |
| raise Exception("The `modelsize` flag was no passed to the CLI.") |
| |
| |
| model_config = model_configurations["GensimConfig"][model_size] |
| model_name = model_configurations["GensimConfig"][model_size]["ModelName"] |
| dataset_fraq_split = model_configurations["GensimConfig"][model_size]["DataSetFracSplit"] |
| random_seed = model_configurations["GensimConfig"][model_size]["RandomSeedSplit"] |
| logging.info(f"Started training of {model_name} Model.") |
| |
|
|
| validate_and_create_subfolders( |
| model_name=model_name |
| ) |
| logging.info(f"Model Folder `{model_name}` was created successfully.") |
| |
| |
| if dataset_fraq_split is None: |
| df = pd.read_parquet("/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip") |
| logging.info(f"The full text Corpus was readed.") |
| |
| else : |
| df = pd.read_parquet("/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip") \ |
| .sample(frac=dataset_fraq_split, random_state=random_seed) \ |
| .reset_index(drop=True) |
| logging.info(f"A random split of {dataset_fraq_split}% was applied on the Text Corpus ") |
| logging.info(f"Dimensions of the dataset: {df.shape}") |
| |
| df.to_parquet(f"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/data/{model_name}.parquet.gzip", compression='gzip') |
| logging.info(f"The Dataset used for this training was successfully saved in: `/Users/luis.morales/Desktop/arxiv-paper-recommender/models/data/{model_name}.parquet.gzip`.") |
| |
| |
|
|
| corpus = df['cleaned_abstracts'].to_list() |
| tokenized_corpus = gensim_tokenizer(corpus) |
| logging.info(f"Dictionary Learned on the {model_name} corpus dataset.") |
| |
| |
| dictionary = get_gensim_dictionary(tokenized_docs=tokenized_corpus, dict_name=model_name, save_dict=True) |
| logging.info("Dictionary Saved Locally.") |
| |
| |
| BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_corpus] |
| tfidf_model = TfidfModel(BoW_corpus) |
| logging.info(f"TD-IDF {model_name} Model was successfully trained.") |
| |
| |
| tfidf_model.save(f"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/tfidf/{model_name}.model") |
| logging.info(f"Model: {model_name} was successfully saved.") |
|
|
|
|
| index = SparseMatrixSimilarity(tfidf_model[BoW_corpus], num_features=len(dictionary)) |
| logging.info(f"The Similarities Sparse Matrix was successfully created.") |
| index.save(f"/Users/luis.morales/Desktop/arxiv-paper-recommender/models/similarities_matrix/{model_name}") |
| logging.info(f"The Similarities Matrix was successfully saved for the model: {model_name}.") |
| |
| end = time.time() |
| total_time = end - start |
| logging.info(f"Full Training of {model_size} model took {total_time} secs.") |
| logging.info(f"The {model_name} Model was successfully trained! yei :)") |
| |
| |