| import os |
| from datetime import datetime |
|
|
| from pathlib import Path |
|
|
| import polars as pl |
| import torch |
| from transformers import AutoModel, AutoTokenizer |
| from transformers import Trainer, TrainingArguments |
| from accelerate import Accelerator, DistributedType |
| from torch.optim import AdamW |
| from torch.utils.data import DataLoader |
|
|
| from utils._constants import * |
| from utils._nlp import get_transformers_word_embeddings |
| from utils._polars import concat_str_columns, slice_join_dataframes |
| from utils._articles import ( |
| convert_text2encoding_with_transformers, |
| create_article_id_to_value_mapping |
| ) |
| from utils._behaviors import ( |
| create_binary_labels_column, |
| sampling_strategy_wu2019, |
| truncate_history, |
| ) |
| from dataset.pytorch_dataloader import ( |
| ebnerd_from_path, |
| NRMSDataset, |
| ) |
| from evaluation import ( |
| MetricEvaluator, |
| AucScore, |
| NdcgScore, |
| MrrScore, |
| F1Score, |
| LogLossScore, |
| RootMeanSquaredError, |
| AccuracyScore |
| ) |
| from models.nrms import NRMSModel |
| from datasets import Dataset, DatasetDict |
| import pyarrow as pa |
| import pyarrow.parquet as pq |
| import polars as pl |
|
|
|
|
# Columns needed to build the test-set submissions (constants come from utils._constants).
COLUMNS = ["impression_id", DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL, DEFAULT_INVIEW_ARTICLES_COL]


test_first_df = pl.read_parquet("testset_joined.parquet")
# Arrow schema shared by both output files; int32 keeps them compact.
schema = pa.schema([
    ("impression_id", pa.int32()),
    ("user_id", pa.int32()),
    ("article_id_fixed", pa.list_(pa.int32())),
    ("article_ids_inview", pa.list_(pa.int32())),
])

# Context managers guarantee both writers are closed (and parquet footers
# written) even if a slice fails mid-stream -- the original leaked the
# writers on any exception inside the loop.
with pq.ParquetWriter("merged_0412_final.parquet", schema) as exp_writer, \
        pq.ParquetWriter("merged_0412_joined_only.parquet", schema) as only_writer:
    # Stream the frame slice-by-slice to keep peak memory bounded.
    for idx, rows in enumerate(test_first_df.select(COLUMNS).iter_slices()):
        print(idx, "\n")
        # Pass-through copy: one row per impression, schema coerced to int32.
        org_table = pa.Table.from_pandas(rows.to_pandas(), schema=schema)
        only_writer.write_table(org_table)

        # Expanded copy: one row per inview article. concat_list re-wraps
        # each exploded scalar into a single-element list so the column
        # still matches the list<int32> schema (faster than a per-row
        # map_elements lambda, and no return_dtype inference warning).
        df = rows.explode("article_ids_inview").with_columns(
            pl.concat_list("article_ids_inview")
        )
        exp_table = pa.Table.from_pandas(df.to_pandas(), schema=schema)
        exp_writer.write_table(exp_table)


# Release the large frame before the Hub uploads below.
del test_first_df
del schema
|
|
# Publish the non-expanded split (one row per impression) to the Hub
# under the "join_test" configuration.
merged_0412_joined_only_df = Dataset.from_parquet("merged_0412_joined_only.parquet")
ebnerd_testset = DatasetDict(testset=merged_0412_joined_only_df)
ebnerd_testset.push_to_hub(
    repo_id="mbhr/EB-NeRD",
    config_name="join_test",
    data_dir="data/join_test",
)


# Drop both references so the dataset can be reclaimed before the
# (larger) expanded split is loaded next.
del merged_0412_joined_only_df
del ebnerd_testset
|
|
# Publish the expanded split (one row per inview article) to the Hub
# under the "join_test_exp" configuration.
merged_0412_final_df = Dataset.from_parquet("merged_0412_final.parquet")
ebnerd_testset = DatasetDict(testset=merged_0412_final_df)
ebnerd_testset.push_to_hub(
    repo_id="mbhr/EB-NeRD",
    config_name="join_test_exp",
    data_dir="data/join_test_exp",
)
|
|