| import pandas as pd |
| from sklearn.pipeline import Pipeline |
| from sklearn.feature_selection import ColumnSelector |
| from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
| from utils.utilities import * |
| import sys |
| from pprint import pprint |
|
|
# Absolute location of the YAML file that holds the model settings.
| CONFIG_FILE_PATH = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/configs.yaml" |
# Loaded once at module level; train_tfidf() reads config["models"]["tfidf"].
# NOTE(review): read_yaml_config presumably comes from the utils.utilities star import - confirm.
| config = read_yaml_config(CONFIG_FILE_PATH) |
# Echo the loaded configuration to stdout.
| pprint(config) |
|
|
@execution_time
def train_tfidf(
    input_path="/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/arxiv_papers.parquet.gzip",
    output_path="/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_tfidf.parquet.gzip",
    sample_size=500000,
):
    """Fit a TF-IDF model on a random sample of abstracts and save the matrix.

    Reads the papers parquet file, draws a random sample of rows, fits a
    ``TfidfVectorizer`` built from ``config["models"]["tfidf"]["tfidf_deffault"]``
    on the ``cleaned_abstracts`` column, and writes the resulting
    document-term matrix (one column per vocabulary term) to parquet.

    Args:
        input_path: Parquet file containing a ``cleaned_abstracts`` column.
        output_path: Destination parquet file for the TF-IDF matrix.
        sample_size: Maximum number of rows to sample; capped at the number
            of rows actually available.
    """
    papers = pd.read_parquet(input_path)

    # DataFrame.sample raises ValueError when n exceeds the row count
    # (sampling without replacement), so clamp to what is available.
    n_rows = min(sample_size, len(papers))
    papers = papers.sample(n_rows).reset_index(drop=True)

    # Key spelling ("tfidf_deffault") is kept as-is; presumably it mirrors
    # the key used in the YAML config file - verify before renaming.
    tfidf_params = config["models"]["tfidf"]["tfidf_deffault"]
    pprint(tfidf_params)
    vectorizer = TfidfVectorizer(**tfidf_params)

    # A leftover debug sys.exit() used to sit here, which made the fit and
    # save steps below unreachable.
    vectors = vectorizer.fit_transform(papers["cleaned_abstracts"])

    # toarray() densifies the sparse matrix (rows x vocabulary size), so the
    # configured vocabulary must be small enough for this to fit in memory.
    tfidf_df = pd.DataFrame(
        vectors.toarray(),
        columns=list(vectorizer.get_feature_names_out()),
    )

    # NOTE(review): the ".gzip" suffix suggests gzip compression, but no
    # compression argument is passed, so pandas' default codec is used -
    # confirm intent.
    tfidf_df.to_parquet(output_path)
| |
# Script entry point: runs the training job at module level, so it executes
# on import as well as on direct execution (there is no __main__ guard).
| train_tfidf() |
|
|
|
|