# Unified model pipeline: dataset preparation
#
# Dependencies (install once per environment):
#   %pip install torch torchvision scikit-learn

import random

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

# Seed every RNG the pipeline touches (Python, NumPy, PyTorch CPU and CUDA).
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # same seed on every GPU in multi-GPU setups

# Ask cuDNN for deterministic kernels. Benchmark mode must stay off: it
# auto-tunes and may select a different algorithm on each run, which defeats
# the determinism requested on the line above.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


def preprocessing(df):
    """Return a cleaned copy of a raw region frame.

    The calm-wind marker '정온' in ``wind_dir`` is replaced by "0", then
    ``wind_dir``, ``lm_cloudcover`` and ``cloudcover`` are cast to int64.
    The cast is explicitly 64-bit because ``prepare_dataset`` detects
    categorical features by the ``int64`` dtype, and a bare ``'int'`` can
    resolve to 32 bits on some platforms.

    Args:
        df: DataFrame containing at least the three columns named above.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()
    df.loc[df['wind_dir'] == '정온', 'wind_dir'] = "0"
    for col in ('wind_dir', 'lm_cloudcover', 'cloudcover'):
        df[col] = df[col].astype('int64')
    return df


def prepare_dataset(region, data_sample='pure', target='multi', fold=3):
    """Load one region and return encoded/scaled train, validation and test splits.

    The validation split is always the rows of ``{region}_train.csv`` whose
    ``year`` equals ``2021 - fold``. With ``data_sample='pure'`` the training
    split is the remaining rows of that file; otherwise it is read from the
    oversampled file for this sampler/fold/region.

    Label encoders and the standard scaler are fitted on the training split
    only and then applied to all three splits, so the result depends on
    ``fold`` even for the test split.

    Args:
        region: Region name used in the CSV file names.
        data_sample: 'pure' or the name of an oversampling variant.
        target: 'binary' or 'multi'; selects the ``{target}_class`` column.
        fold: Fold number; determines the held-out validation year.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test,
        categorical_cols, numerical_cols)``.

    Raises:
        ValueError: From ``LabelEncoder.transform`` if the validation or test
            split contains a category value never seen in training.
    """
    dat_path = f"../data/data_for_modeling/{region}_train.csv"
    test_path = f"../data/data_for_modeling/{region}_test.csv"
    drop_col = ['binary_class', 'multi_class', 'visi', 'year']
    target_col = f'{target}_class'
    val_year = 2021 - fold

    # Load the raw splits.
    region_dat = preprocessing(pd.read_csv(dat_path, index_col=0))
    if data_sample == 'pure':
        region_train = region_dat.loc[~region_dat['year'].isin([val_year]), :]
    else:
        train_path = f'../data/data_oversampled/{data_sample}/{data_sample}_{fold}_{region}.csv'
        region_train = preprocessing(pd.read_csv(train_path))
    region_val = region_dat.loc[region_dat['year'].isin([val_year]), :]
    region_test = preprocessing(pd.read_csv(test_path))

    # Use the training column order everywhere so all splits line up.
    common_columns = region_train.columns.to_list()
    train_data = region_train[common_columns]
    val_data = region_val[common_columns]
    test_data = region_test[common_columns]

    # Split features from the target.
    X_train, y_train = train_data.drop(columns=drop_col), train_data[target_col]
    X_val, y_val = val_data.drop(columns=drop_col), val_data[target_col]
    X_test, y_test = test_data.drop(columns=drop_col), test_data[target_col]

    # Integer and object columns are treated as categorical, float64 as continuous.
    categorical_cols = X_train.select_dtypes(include=['object', 'category', 'int64']).columns
    numerical_cols = X_train.select_dtypes(include=['float64']).columns

    # Label-encode each categorical column with an encoder fitted on train only.
    for col in categorical_cols:
        encoder = LabelEncoder().fit(X_train[col])
        X_train[col] = encoder.transform(X_train[col])
        X_val[col] = encoder.transform(X_val[col])
        X_test[col] = encoder.transform(X_test[col])

    # Standardise continuous columns with statistics from train only.
    scaler = StandardScaler().fit(X_train[numerical_cols])
    X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
    X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    return X_train, X_val, X_test, y_train, y_val, y_test, categorical_cols, numerical_cols


def prepare_dataloader(region, data_sample='pure', target='multi', fold=3, random_state=None, batch_size=64):
    """Build train/validation/test DataLoaders on top of ``prepare_dataset``.

    Every batch is a triple ``(x_num, x_cat, y)``: float32 continuous
    features, int64 label-encoded categorical features, and the target
    (float32 for 'binary', int64 for 'multi').

    Args:
        region, data_sample, target, fold: Forwarded to ``prepare_dataset``.
        random_state: Seed for the training loader's shuffle generator.
            ``None`` leaves shuffling on PyTorch's global RNG.
        batch_size: Batch size for all three loaders.

    Returns:
        ``(X_train, categorical_cols, numerical_cols,
        train_loader, val_loader, test_loader)``.

    Raises:
        ValueError: If ``target`` is neither 'binary' nor 'multi'. This is
            checked before any file is read.
    """
    if target == "binary":
        label_dtype = torch.float32
    elif target == "multi":
        label_dtype = torch.long
    else:
        raise ValueError("target must be 'binary' or 'multi'")

    (X_train, X_val, X_test,
     y_train, y_val, y_test,
     categorical_cols, numerical_cols) = prepare_dataset(
        region, data_sample=data_sample, target=target, fold=fold)

    def _to_dataset(features, labels):
        # One tensor per input stream, so the model receives them separately.
        return TensorDataset(
            torch.tensor(features[numerical_cols].values, dtype=torch.float32),
            torch.tensor(features[categorical_cols].values, dtype=torch.long),
            torch.tensor(labels.values, dtype=label_dtype),
        )

    # A dedicated generator makes the shuffle order reproducible without
    # touching the global RNG; None falls back to DataLoader's default.
    generator = None if random_state is None else torch.Generator().manual_seed(random_state)

    train_loader = DataLoader(_to_dataset(X_train, y_train), batch_size=batch_size,
                              shuffle=True, generator=generator)
    val_loader = DataLoader(_to_dataset(X_val, y_val), batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(_to_dataset(X_test, y_test), batch_size=batch_size, shuffle=False)

    return X_train, categorical_cols, numerical_cols, train_loader, val_loader, test_loader
# Metrics, hyperparameter search and per-fold training
#
# Optional, for the Optuna progress bar:
#   %pip install --upgrade ipywidgets

import copy
import os

import numpy as np
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from ft_transformer import FTTransformer
from resnet_like import ResNetLike
from deepgbm import DeepGBM

# Hide per-trial Optuna logs (WARNING and above only).
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def calculate_csi(Y_test, pred):
    """Compute the CSI score from a fixed 3x3 confusion matrix.

    Rows are true labels and columns are predictions, for labels 0, 1, 2:
      H = cm[0,0] + cm[1,1]                      (classes 0/1 predicted correctly)
      F = cm[1,0] + cm[2,0] + cm[0,1] + cm[2,1]  (wrong predictions of 0 or 1)
      M = cm[0,2] + cm[1,2]                      (true 0/1 predicted as 2)
    The cell cm[2,2] does not enter the score.

    ``labels=[0, 1, 2]`` keeps the matrix 3x3 even when a class is absent
    from both inputs (common on small validation folds); without it the
    indexing below raises IndexError.

    Args:
        Y_test: True labels in {0, 1, 2}.
        pred: Predicted labels in {0, 1, 2}.

    Returns:
        ``H / (H + F + M + 1e-10)``; the epsilon avoids division by zero.
    """
    cm = confusion_matrix(Y_test, pred, labels=[0, 1, 2])
    H = cm[0, 0] + cm[1, 1]
    F = cm[1, 0] + cm[2, 0] + cm[0, 1] + cm[2, 1]
    M = cm[0, 2] + cm[1, 2]
    return H / (H + F + M + 1e-10)


def eval_metric_csi(y_true, pred_prob):
    """Return the negated CSI of the argmax predictions.

    The sign is flipped so the value can be used where lower means better.

    Args:
        y_true: True labels.
        pred_prob: Array of per-class scores, one row per sample.
    """
    y_pred = np.argmax(pred_prob, axis=1)
    return -1 * calculate_csi(y_true, y_pred)


def sample_weight(y_train):
    """Return one 'balanced' class weight per training sample.

    Weights are looked up by label value rather than by array position, so
    label sets that are not exactly 0..k-1 are handled correctly.

    Args:
        y_train: 1-D array-like of class labels.

    Returns:
        NumPy array of weights aligned with ``y_train``.
    """
    classes = np.unique(y_train)
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=y_train
    )
    weight_of = dict(zip(classes, class_weights))
    return np.array([weight_of[label] for label in y_train])


def _suggest_params(trial, model_choose):
    """Sample the search space of the chosen architecture from an Optuna trial."""
    if model_choose == "ft_transformer":
        return {
            "d_token": trial.suggest_categorical("d_token", [64, 128, 192, 256]),
            "n_blocks": trial.suggest_int("n_blocks", 4, 8),
            "attention_dropout": trial.suggest_float("attention_dropout", 0.2, 0.5),
            "ffn_dropout": trial.suggest_float("ffn_dropout", 0.2, 0.5),
            "lr": trial.suggest_float("lr", 1e-4, 1e-3, log=True),
            "weight_decay": trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True),
        }
    if model_choose == 'resnet_like':
        return {
            "d_main": trial.suggest_categorical("d_main", [64, 128, 192, 256]),
            "d_hidden": trial.suggest_categorical("d_hidden", [32, 64, 128]),
            "n_blocks": trial.suggest_int("n_blocks", 3, 8),
            "dropout_first": trial.suggest_float("dropout_first", 0.1, 0.5),
            "dropout_second": trial.suggest_float("dropout_second", 0.0, 0.3),
            "lr": trial.suggest_float("lr", 1e-4, 1e-2, log=True),
            "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True),
        }
    if model_choose == 'deepgbm':
        return {
            "d_main": trial.suggest_categorical("d_main", [64, 128, 192, 256]),
            "d_hidden": trial.suggest_categorical("d_hidden", [32, 64, 128]),
            "n_blocks": trial.suggest_int("n_blocks", 3, 8),
            "dropout": trial.suggest_float("dropout", 0.1, 0.5),
            "lr": trial.suggest_float("lr", 1e-4, 1e-3, log=True),
            "weight_decay": trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True),
        }
    raise ValueError("model_choose must be 'ft_transformer', 'resnet_like' or 'deepgbm'")


def _build_model(model_choose, params, X_train, categorical_cols, numerical_cols):
    """Instantiate the chosen architecture (always 3 output classes) on ``device``."""
    # Encoded categories are 0..k-1, so the unique count is the cardinality.
    cardinalities = [len(X_train[col].unique()) for col in categorical_cols]

    if model_choose == 'ft_transformer':
        model = FTTransformer(
            num_features=len(numerical_cols),
            cat_cardinalities=cardinalities,
            d_token=params["d_token"],
            n_blocks=params["n_blocks"],
            attention_dropout=params["attention_dropout"],
            ffn_dropout=params["ffn_dropout"],
            num_classes=3
        )
    elif model_choose == 'resnet_like':
        model = ResNetLike(
            # Continuous and categorical features together form the input width.
            input_dim=len(numerical_cols) + len(categorical_cols),
            d_main=params["d_main"],
            d_hidden=params["d_hidden"],
            n_blocks=params["n_blocks"],
            dropout_first=params["dropout_first"],
            dropout_second=params["dropout_second"],
            num_classes=3
        )
    elif model_choose == 'deepgbm':
        model = DeepGBM(
            num_features=len(numerical_cols),
            cat_features=cardinalities,
            d_main=params["d_main"],
            d_hidden=params["d_hidden"],
            n_blocks=params["n_blocks"],
            dropout=params["dropout"],
            num_classes=3
        )
    else:
        raise ValueError("model_choose must be 'ft_transformer', 'resnet_like' or 'deepgbm'")
    return model.to(device)


def _make_criterion(target):
    """Return the loss for the task: BCE-with-logits (binary) or cross-entropy (multi)."""
    if target == 'binary':
        return nn.BCEWithLogitsLoss()
    if target == 'multi':
        return nn.CrossEntropyLoss()
    raise ValueError("target must be 'binary' or 'multi'")


def _train_one_epoch(model, train_loader, criterion, optimizer, target):
    """Run one optimisation pass over ``train_loader``."""
    model.train()
    for x_num_batch, x_cat_batch, y_batch in train_loader:
        x_num_batch = x_num_batch.to(device)
        x_cat_batch = x_cat_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model(x_num_batch, x_cat_batch)
        if target == 'binary':
            loss = criterion(y_pred.squeeze(-1), y_batch.float())
        else:
            loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()


def _validation_csi(model, val_loader, target):
    """Predict on ``val_loader`` and return the CSI of the hard predictions."""
    model.eval()
    y_true_val, y_pred_val = [], []
    with torch.no_grad():
        for x_num_batch, x_cat_batch, y_batch in val_loader:
            output = model(x_num_batch.to(device), x_cat_batch.to(device))
            if target == 'binary':
                pred = (torch.sigmoid(output) >= 0.5).long()
            else:
                pred = output.argmax(dim=1)
            y_pred_val.extend(pred.cpu().numpy())
            y_true_val.extend(y_batch.cpu().numpy())
    return calculate_csi(y_true_val, y_pred_val)


def objective(trial, model_choose, region, data_sample='pure', target='multi', n_folds=3, random_state=None):
    """Optuna objective: mean best validation CSI over all folds.

    One hyperparameter set is sampled per trial and a fresh model is trained
    on every fold (at most 50 epochs, early stopping after 8 epochs without
    improvement).

    NOTE: models are always built with ``num_classes=3`` and
    ``calculate_csi`` reads a 3x3 matrix, so only ``target='multi'`` is
    exercised end-to-end by this code.

    Args:
        trial: Optuna trial used for sampling and pruning.
        model_choose: 'ft_transformer', 'resnet_like' or 'deepgbm'.
        region, data_sample, target: Forwarded to ``prepare_dataloader``.
        n_folds: Number of folds (1..n_folds are used).
        random_state: Seed for the training loader shuffle, or None.

    Returns:
        Average over folds of the best validation CSI seen in each fold.

    Raises:
        ValueError: For an unknown ``model_choose``/``target`` or ``n_folds < 1``.
        optuna.exceptions.TrialPruned: When the pruner stops the trial.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")

    params = _suggest_params(trial, model_choose)
    epochs = 50
    patience = 8  # stop a fold after 8 epochs without improvement
    val_scores = []

    for fold in range(1, n_folds + 1):
        X_train, categorical_cols, numerical_cols, train_loader, val_loader, _ = prepare_dataloader(
            region, data_sample=data_sample, target=target, fold=fold, random_state=random_state)

        model = _build_model(model_choose, params, X_train, categorical_cols, numerical_cols)
        criterion = _make_criterion(target)
        optimizer = optim.AdamW(model.parameters(), lr=params["lr"], weight_decay=params["weight_decay"])

        best_val_score = 0
        counter = 0

        for epoch in range(epochs):
            _train_one_epoch(model, train_loader, criterion, optimizer, target)
            val_csi = _validation_csi(model, val_loader, target)

            # Optuna keeps only the first value reported for a given step, so
            # each fold gets its own step range; otherwise every report after
            # fold 1 is dropped and pruning never sees the later folds.
            trial.report(val_csi, (fold - 1) * epochs + epoch)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

            if val_csi > best_val_score:
                best_val_score = val_csi
                counter = 0
            else:
                counter += 1

            if counter >= patience:
                break

        val_scores.append(best_val_score)

    return sum(val_scores) / len(val_scores)


def fold_voting(model_choose, region, data_sample='pure', target='multi', n_folds=3, random_state=None, n_trials=50):
    """Tune hyperparameters, then train and save one model per fold.

    An Optuna study maximises ``objective`` to find a single hyperparameter
    set shared by all folds. Each fold is then retrained with it (at most 50
    epochs, early stopping after 10 epochs without CSI improvement) and the
    best-epoch model is saved to
    ``./save_model/{model_choose}/{data_sample}/{region}_fold{fold}.pth``.

    Args:
        model_choose: 'ft_transformer', 'resnet_like' or 'deepgbm'.
        region, data_sample, target: Forwarded to ``prepare_dataloader``.
        n_folds: Number of folds to train.
        random_state: Seed for the sampler and the final training loaders.
            When None, the notebook-level ``seed`` is used for those two; the
            search itself still receives None and shuffles unseeded.
        n_trials: Number of Optuna trials.

    Returns:
        List of the checkpoint paths written, in fold order.
    """
    run_seed = seed if random_state is None else random_state

    # Hyperparameter search.
    sampler = optuna.samplers.TPESampler(seed=run_seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(
        lambda trial: objective(trial, model_choose=model_choose, region=region, data_sample=data_sample,
                                target=target, n_folds=n_folds, random_state=random_state),
        n_trials=n_trials, show_progress_bar=True)

    best_params = study.best_trial.params
    print(f"### Best Params (All Folds): {best_params} ###")

    save_dir = f"./save_model/{model_choose}/{data_sample}"
    os.makedirs(save_dir, exist_ok=True)

    epochs = 50
    patience = 10  # epochs without CSI improvement before stopping
    model_paths = []

    for fold in range(1, n_folds + 1):
        X_train, categorical_cols, numerical_cols, train_loader, val_loader, _ = prepare_dataloader(
            region=region, data_sample=data_sample, target=target, fold=fold, random_state=run_seed)

        model = _build_model(model_choose, best_params, X_train, categorical_cols, numerical_cols)
        criterion = _make_criterion(target)
        optimizer_ft = optim.AdamW(model.parameters(), lr=best_params["lr"],
                                   weight_decay=best_params["weight_decay"])

        best_csi = -float('inf')  # CSI is maximised, so the first epoch always improves
        counter = 0
        best_model = None

        for epoch in range(epochs):
            _train_one_epoch(model, train_loader, criterion, optimizer_ft, target)
            val_csi = _validation_csi(model, val_loader, target)

            if val_csi > best_csi:
                best_csi = val_csi
                counter = 0
                best_model = copy.deepcopy(model)  # snapshot of the best epoch
            else:
                counter += 1
                if counter >= patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        # The whole module is pickled (not just a state_dict) because the
        # prediction functions rebuild the model with torch.load alone.
        model_path = f"{save_dir}/{region}_fold{fold}.pth"
        torch.save(best_model, model_path)
        model_paths.append(model_path)
        print(f"Saving model to {model_path}")

    return model_paths
# Test-set prediction: single fold and soft-voting ensemble

import glob
import os

import numpy as np
import torch

# Run on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _load_fold_model(model_path):
    """Load a pickled model checkpoint onto ``device`` in eval mode.

    SECURITY: ``weights_only=False`` unpickles arbitrary Python objects. Only
    load checkpoints that this project wrote itself.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"No saved model at {model_path}")
    # map_location lets a GPU-trained checkpoint load on a CPU-only machine.
    model = torch.load(model_path, map_location=device, weights_only=False).to(device)
    model.eval()
    return model


def _predict_proba(model, test_loader):
    """Return softmax class probabilities for every sample in ``test_loader``."""
    test_preds = []
    with torch.no_grad():
        for x_num_batch, x_cat_batch, _ in test_loader:
            output = model(x_num_batch.to(device), x_cat_batch.to(device))
            output = torch.softmax(output, dim=1)
            test_preds.extend(output.cpu().numpy())
    return np.array(test_preds)


def pred_fold(region, model_choose, data_sample, fold, target='multi'):
    """Predict the test set with the model trained on one specific fold.

    The test features are prepared with the same ``fold`` the model was
    trained on, because encoders and scalers are fitted on that fold's
    training split. The checkpoint is addressed by its exact file name
    rather than by position in an unordered directory listing.

    Args:
        region: Region name.
        model_choose: Model directory name under ``./save_model``.
        data_sample: Sampling variant directory name.
        fold: Fold number of the model to load.
        target: 'binary' or 'multi'.

    Returns:
        ``(y_test, final_preds)``: the test labels and argmax predictions.

    Raises:
        FileNotFoundError: If the fold's checkpoint does not exist.
    """
    _, _, _, _, _, y_test, _, _ = prepare_dataset(
        region=region, data_sample=data_sample, target=target, fold=fold)
    _, _, _, _, _, test_loader = prepare_dataloader(
        region=region, data_sample=data_sample, target=target, fold=fold, random_state=seed)

    model_path = f'./save_model/{model_choose}/{data_sample}/{region}_fold{fold}.pth'
    model = _load_fold_model(model_path)

    test_probs = _predict_proba(model, test_loader)
    final_preds = np.argmax(test_probs, axis=1)

    return y_test, final_preds


def soft_voting(region, model_choose, data_sample, target='multi'):
    """Average the class probabilities of all saved fold models for a region.

    Checkpoints named ``{region}_fold{k}.pth`` are discovered in the model
    folder. Each one is evaluated on test features prepared with its own
    fold ``k``, and the resulting probabilities are averaged.

    Args:
        region: Region name.
        model_choose: Model directory name under ``./save_model``.
        data_sample: Sampling variant directory name.
        target: 'binary' or 'multi'.

    Returns:
        ``(y_test, test_probs, final_preds)``: test labels, averaged
        probabilities, and their argmax.

    Raises:
        FileNotFoundError: If no checkpoint exists for this combination.
    """
    _, _, _, _, _, y_test, _, _ = prepare_dataset(region=region, data_sample=data_sample, target=target)

    folder_path = f'./save_model/{model_choose}/{data_sample}'
    prefix = f'{region}_fold'
    # Match on the file name only, so a region string appearing elsewhere in
    # the path cannot pull in another region's checkpoints.
    model_paths = sorted(glob.glob(f'{folder_path}/{prefix}*.pth'))
    if not model_paths:
        raise FileNotFoundError(f"No saved models matching {folder_path}/{prefix}*.pth")

    if target == 'multi':
        test_probs = np.zeros((len(y_test), 3))
    elif target == 'binary':
        test_probs = np.zeros((len(y_test), 2))
    else:
        raise ValueError("target must be 'binary' or 'multi'")

    for path in model_paths:
        # Recover the fold number from '{region}_fold{k}.pth'.
        stem = os.path.splitext(os.path.basename(path))[0]
        fold = int(stem[len(prefix):])

        _, _, _, _, _, test_loader = prepare_dataloader(
            region=region, data_sample=data_sample, target=target, fold=fold, random_state=seed)

        model = _load_fold_model(path)
        test_probs += _predict_proba(model, test_loader) / len(model_paths)

    # Soft voting: pick the class with the highest mean probability.
    final_preds = np.argmax(test_probs, axis=1)

    return y_test, test_probs, final_preds


# Per-model K-fold training
# Runs one full Optuna study per (model, region, sampling variant): 90 studies in total.
for model_choose in ['ft_transformer', 'resnet_like', 'deepgbm']:
    for region in ['seoul', 'busan', 'daejeon', 'daegu', 'incheon', 'gwangju']:
        for data_sample in ['pure', 'smote', 'ctgan7000', 'ctgan10000', 'ctgan20000']:
            fold_voting(model_choose=model_choose, region=region, data_sample=data_sample, random_state=seed)
50\n", - "Saving model to ./save_model/deepgbm/pure/seoul_fold2.pth\n", - "Early stopping at epoch 24\n", - "Saving model to ./save_model/deepgbm/pure/seoul_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "53d2758c71964eb28e446dae5138a3e9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 128, 'n_blocks': 4, 'dropout': 0.18531333425244892, 'lr': 0.00019904391652517882, 'weight_decay': 7.803511669278675e-05} ###\n", - "Early stopping at epoch 12\n", - "Saving model to ./save_model/deepgbm/smote/seoul_fold1.pth\n", - "Early stopping at epoch 14\n", - "Saving model to ./save_model/deepgbm/smote/seoul_fold2.pth\n", - "Early stopping at epoch 19\n", - "Saving model to ./save_model/deepgbm/smote/seoul_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1a34a1880a7c43a69835c3f95784175e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.10718475024592788, 'lr': 0.0009432177379597497, 'weight_decay': 4.308729113230509e-05} ###\n", - "Early stopping at epoch 13\n", - "Saving model to ./save_model/deepgbm/ctgan7000/seoul_fold1.pth\n", - "Early stopping at epoch 19\n", - "Saving model to ./save_model/deepgbm/ctgan7000/seoul_fold2.pth\n", - "Early stopping at epoch 18\n", - "Saving model to ./save_model/deepgbm/ctgan7000/seoul_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3ec2807729c641f59163b26756414b28", - "version_major": 2, - 
"version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 64, 'n_blocks': 3, 'dropout': 0.14063926069511057, 'lr': 0.0004059290878693028, 'weight_decay': 0.0008509845719526007} ###\n", - "Early stopping at epoch 14\n", - "Saving model to ./save_model/deepgbm/ctgan10000/seoul_fold1.pth\n", - "Early stopping at epoch 18\n", - "Saving model to ./save_model/deepgbm/ctgan10000/seoul_fold2.pth\n", - "Early stopping at epoch 26\n", - "Saving model to ./save_model/deepgbm/ctgan10000/seoul_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4d7a3ceee7854891aa2cfef6265ef833", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.34044600469728353, 'lr': 0.0005105903209394755, 'weight_decay': 1.0994335574766187e-05} ###\n", - "Early stopping at epoch 35\n", - "Saving model to ./save_model/deepgbm/ctgan20000/seoul_fold1.pth\n", - "Early stopping at epoch 48\n", - "Saving model to ./save_model/deepgbm/ctgan20000/seoul_fold2.pth\n", - "Early stopping at epoch 13\n", - "Saving model to ./save_model/deepgbm/ctgan20000/seoul_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d941a8b45f5b47aba5e38e89ba96e5bc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 
0.4651768944020654, 'lr': 0.0005440940639887522, 'weight_decay': 1.0703736319022912e-05} ###\n", - "Early stopping at epoch 18\n", - "Saving model to ./save_model/deepgbm/pure/busan_fold1.pth\n", - "Early stopping at epoch 21\n", - "Saving model to ./save_model/deepgbm/pure/busan_fold2.pth\n", - "Early stopping at epoch 26\n", - "Saving model to ./save_model/deepgbm/pure/busan_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cd7785fd4688437bad84b4cd3dd17402", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 7, 'dropout': 0.36241152483468775, 'lr': 0.0005993036540724003, 'weight_decay': 0.0006194094729468491} ###\n", - "Early stopping at epoch 22\n", - "Saving model to ./save_model/deepgbm/smote/busan_fold1.pth\n", - "Early stopping at epoch 15\n", - "Saving model to ./save_model/deepgbm/smote/busan_fold2.pth\n", - "Early stopping at epoch 20\n", - "Saving model to ./save_model/deepgbm/smote/busan_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4236916937fa4d299507f7bf67480433", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 5, 'dropout': 0.32589216101245205, 'lr': 0.000870943966836362, 'weight_decay': 0.0004755258531422745} ###\n", - "Early stopping at epoch 32\n", - "Saving model to ./save_model/deepgbm/ctgan7000/busan_fold1.pth\n", - "Early stopping at epoch 41\n", - "Saving model to ./save_model/deepgbm/ctgan7000/busan_fold2.pth\n", - "Early stopping at epoch 
18\n", - "Saving model to ./save_model/deepgbm/ctgan7000/busan_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a58c944a170f4a27939007d870df4d3b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 5, 'dropout': 0.22554979058504504, 'lr': 0.0007872290481937401, 'weight_decay': 1.3478651710155104e-05} ###\n", - "Early stopping at epoch 30\n", - "Saving model to ./save_model/deepgbm/ctgan10000/busan_fold1.pth\n", - "Early stopping at epoch 15\n", - "Saving model to ./save_model/deepgbm/ctgan10000/busan_fold2.pth\n", - "Early stopping at epoch 17\n", - "Saving model to ./save_model/deepgbm/ctgan10000/busan_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "794104daa0814654b5a7374e63bad90b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 4, 'dropout': 0.23007332881069884, 'lr': 0.0005365450324352025, 'weight_decay': 0.00018841476921545086} ###\n", - "Early stopping at epoch 34\n", - "Saving model to ./save_model/deepgbm/ctgan20000/busan_fold1.pth\n", - "Early stopping at epoch 21\n", - "Saving model to ./save_model/deepgbm/ctgan20000/busan_fold2.pth\n", - "Saving model to ./save_model/deepgbm/ctgan20000/busan_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "516dd122b4cf443eaa6514976e0ba235", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - 
"output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.34044600469728353, 'lr': 0.0005105903209394755, 'weight_decay': 1.0994335574766187e-05} ###\n", - "Early stopping at epoch 18\n", - "Saving model to ./save_model/deepgbm/pure/daejeon_fold1.pth\n", - "Early stopping at epoch 20\n", - "Saving model to ./save_model/deepgbm/pure/daejeon_fold2.pth\n", - "Early stopping at epoch 25\n", - "Saving model to ./save_model/deepgbm/pure/daejeon_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9e3bf1f1058f4dcfa1254f06dcb0fe38", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 64, 'd_hidden': 128, 'n_blocks': 5, 'dropout': 0.2994658107096196, 'lr': 0.0005068273287449525, 'weight_decay': 2.0471105563346853e-05} ###\n", - "Early stopping at epoch 16\n", - "Saving model to ./save_model/deepgbm/smote/daejeon_fold1.pth\n", - "Early stopping at epoch 16\n", - "Saving model to ./save_model/deepgbm/smote/daejeon_fold2.pth\n", - "Early stopping at epoch 21\n", - "Saving model to ./save_model/deepgbm/smote/daejeon_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d188977daf5147fe8ed6dffd4d233c06", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 7, 'dropout': 0.38274293753904687, 'lr': 0.0005358055009231865, 'weight_decay': 0.000348771262454593} ###\n", - "Early stopping at epoch 31\n", - 
"Saving model to ./save_model/deepgbm/ctgan7000/daejeon_fold1.pth\n", - "Early stopping at epoch 23\n", - "Saving model to ./save_model/deepgbm/ctgan7000/daejeon_fold2.pth\n", - "Early stopping at epoch 16\n", - "Saving model to ./save_model/deepgbm/ctgan7000/daejeon_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "76f8c4fd7727459ba4d87aada9cbcf87", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 3, 'dropout': 0.37335751073629203, 'lr': 0.0005518803643548146, 'weight_decay': 0.0006506083261092598} ###\n", - "Early stopping at epoch 42\n", - "Saving model to ./save_model/deepgbm/ctgan10000/daejeon_fold1.pth\n", - "Early stopping at epoch 26\n", - "Saving model to ./save_model/deepgbm/ctgan10000/daejeon_fold2.pth\n", - "Early stopping at epoch 33\n", - "Saving model to ./save_model/deepgbm/ctgan10000/daejeon_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c2f82c3c6d6a4cf5933e6dd18eede561", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 7, 'dropout': 0.498315839448228, 'lr': 0.000982377661956639, 'weight_decay': 5.1544513708209705e-05} ###\n", - "Early stopping at epoch 44\n", - "Saving model to ./save_model/deepgbm/ctgan20000/daejeon_fold1.pth\n", - "Early stopping at epoch 21\n", - "Saving model to ./save_model/deepgbm/ctgan20000/daejeon_fold2.pth\n", - "Early stopping at epoch 39\n", - "Saving model to ./save_model/deepgbm/ctgan20000/daejeon_fold3.pth\n" - ] - }, - { 
- "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "485ab1b95b0446348036e7f85f06670b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 192, 'd_hidden': 128, 'n_blocks': 4, 'dropout': 0.44295645200373956, 'lr': 0.0006995754135310067, 'weight_decay': 7.588529866409616e-05} ###\n", - "Early stopping at epoch 12\n", - "Saving model to ./save_model/deepgbm/pure/daegu_fold1.pth\n", - "Early stopping at epoch 13\n", - "Saving model to ./save_model/deepgbm/pure/daegu_fold2.pth\n", - "Early stopping at epoch 18\n", - "Saving model to ./save_model/deepgbm/pure/daegu_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8bf0047f247b4a5caf0f2d12383850cc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 128, 'n_blocks': 3, 'dropout': 0.34301794076057535, 'lr': 0.00014808945119975197, 'weight_decay': 1.3492834268013232e-05} ###\n", - "Early stopping at epoch 25\n", - "Saving model to ./save_model/deepgbm/smote/daegu_fold1.pth\n", - "Early stopping at epoch 24\n", - "Saving model to ./save_model/deepgbm/smote/daegu_fold2.pth\n", - "Early stopping at epoch 23\n", - "Saving model to ./save_model/deepgbm/smote/daegu_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cf4d89937031420cab15ecba54ffc850", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": 
[ - "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 128, 'n_blocks': 6, 'dropout': 0.4749441191638353, 'lr': 0.0007742129275419718, 'weight_decay': 0.0001276488991046795} ###\n", - "Early stopping at epoch 17\n", - "Saving model to ./save_model/deepgbm/ctgan7000/daegu_fold1.pth\n", - "Early stopping at epoch 39\n", - "Saving model to ./save_model/deepgbm/ctgan7000/daegu_fold2.pth\n", - "Early stopping at epoch 48\n", - "Saving model to ./save_model/deepgbm/ctgan7000/daegu_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c0df822671e74384a66b4b0395e6ab81", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.32191381712732553, 'lr': 0.0006025357742197527, 'weight_decay': 1.6123356366734284e-05} ###\n", - "Early stopping at epoch 25\n", - "Saving model to ./save_model/deepgbm/ctgan10000/daegu_fold1.pth\n", - "Early stopping at epoch 40\n", - "Saving model to ./save_model/deepgbm/ctgan10000/daegu_fold2.pth\n", - "Early stopping at epoch 23\n", - "Saving model to ./save_model/deepgbm/ctgan10000/daegu_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9492d14036d64bb7bb2eb8c3d2bee972", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 128, 'n_blocks': 7, 'dropout': 0.49897739440085004, 'lr': 0.0009903735540622498, 'weight_decay': 5.1544513708209705e-05} ###\n", - "Early stopping at epoch 29\n", - "Saving model to ./save_model/deepgbm/ctgan20000/daegu_fold1.pth\n", - "Early 
stopping at epoch 20\n", - "Saving model to ./save_model/deepgbm/ctgan20000/daegu_fold2.pth\n", - "Early stopping at epoch 18\n", - "Saving model to ./save_model/deepgbm/ctgan20000/daegu_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5c8a303b51f94df288ff928629f0d728", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 8, 'dropout': 0.34044600469728353, 'lr': 0.0005105903209394755, 'weight_decay': 1.0994335574766187e-05} ###\n", - "Early stopping at epoch 15\n", - "Saving model to ./save_model/deepgbm/pure/incheon_fold1.pth\n", - "Early stopping at epoch 33\n", - "Saving model to ./save_model/deepgbm/pure/incheon_fold2.pth\n", - "Early stopping at epoch 22\n", - "Saving model to ./save_model/deepgbm/pure/incheon_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6d66e1cd2b2b4aebac519cc9f774853b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 256, 'd_hidden': 32, 'n_blocks': 6, 'dropout': 0.4453655419744991, 'lr': 0.0006995754135310067, 'weight_decay': 7.03362294158028e-05} ###\n", - "Early stopping at epoch 28\n", - "Saving model to ./save_model/deepgbm/smote/incheon_fold1.pth\n", - "Early stopping at epoch 17\n", - "Saving model to ./save_model/deepgbm/smote/incheon_fold2.pth\n", - "Early stopping at epoch 14\n", - "Saving model to ./save_model/deepgbm/smote/incheon_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "189a4560776e4601b2490693d9456b91", - 
"version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 3, 'dropout': 0.29807076404450805, 'lr': 0.00010824018381500966, 'weight_decay': 0.000658628931758311} ###\n", - "Early stopping at epoch 35\n", - "Saving model to ./save_model/deepgbm/ctgan7000/incheon_fold1.pth\n", - "Early stopping at epoch 28\n", - "Saving model to ./save_model/deepgbm/ctgan7000/incheon_fold2.pth\n", - "Early stopping at epoch 41\n", - "Saving model to ./save_model/deepgbm/ctgan7000/incheon_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "04414670ee404a1f958e74d114044570", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 4, 'dropout': 0.4656646404627707, 'lr': 0.000725954595588046, 'weight_decay': 9.156653323796898e-05} ###\n", - "Early stopping at epoch 14\n", - "Saving model to ./save_model/deepgbm/ctgan10000/incheon_fold1.pth\n", - "Early stopping at epoch 25\n", - "Saving model to ./save_model/deepgbm/ctgan10000/incheon_fold2.pth\n", - "Early stopping at epoch 38\n", - "Saving model to ./save_model/deepgbm/ctgan10000/incheon_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c03845943e074a83a880a4080d779cfa", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 64, 'd_hidden': 32, 'n_blocks': 
3, 'dropout': 0.29122487178013184, 'lr': 0.0008644753350143602, 'weight_decay': 0.0006848083277415096} ###\n", - "Early stopping at epoch 39\n", - "Saving model to ./save_model/deepgbm/ctgan20000/incheon_fold1.pth\n", - "Early stopping at epoch 32\n", - "Saving model to ./save_model/deepgbm/ctgan20000/incheon_fold2.pth\n", - "Early stopping at epoch 44\n", - "Saving model to ./save_model/deepgbm/ctgan20000/incheon_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4d15a078f9ea42279a268e574300d613", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 192, 'd_hidden': 32, 'n_blocks': 7, 'dropout': 0.4268924061648935, 'lr': 0.00031827191626512363, 'weight_decay': 7.927166314511625e-05} ###\n", - "Early stopping at epoch 41\n", - "Saving model to ./save_model/deepgbm/pure/gwangju_fold1.pth\n", - "Early stopping at epoch 26\n", - "Saving model to ./save_model/deepgbm/pure/gwangju_fold2.pth\n", - "Early stopping at epoch 18\n", - "Saving model to ./save_model/deepgbm/pure/gwangju_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "718bf72a28814022bd53ea88c94a39da", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 64, 'n_blocks': 3, 'dropout': 0.49840679574448543, 'lr': 0.0007932730569025867, 'weight_decay': 5.2371870545865264e-05} ###\n", - "Early stopping at epoch 34\n", - "Saving model to ./save_model/deepgbm/smote/gwangju_fold1.pth\n", - "Early stopping at epoch 34\n", - "Saving model to 
./save_model/deepgbm/smote/gwangju_fold2.pth\n", - "Early stopping at epoch 22\n", - "Saving model to ./save_model/deepgbm/smote/gwangju_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4470573fa1844b6ebf6e59f2ff9d865c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 64, 'd_hidden': 128, 'n_blocks': 7, 'dropout': 0.3593531534642382, 'lr': 0.0007435971973093597, 'weight_decay': 0.00039310742030944726} ###\n", - "Early stopping at epoch 35\n", - "Saving model to ./save_model/deepgbm/ctgan7000/gwangju_fold1.pth\n", - "Early stopping at epoch 22\n", - "Saving model to ./save_model/deepgbm/ctgan7000/gwangju_fold2.pth\n", - "Early stopping at epoch 29\n", - "Saving model to ./save_model/deepgbm/ctgan7000/gwangju_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2f0a9966e1b04380b4abf79e45a022a7", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 3, 'dropout': 0.3613681971081546, 'lr': 0.0008491548609780148, 'weight_decay': 0.0004487641324799225} ###\n", - "Early stopping at epoch 29\n", - "Saving model to ./save_model/deepgbm/ctgan10000/gwangju_fold1.pth\n", - "Early stopping at epoch 30\n", - "Saving model to ./save_model/deepgbm/ctgan10000/gwangju_fold2.pth\n", - "Early stopping at epoch 42\n", - "Saving model to ./save_model/deepgbm/ctgan10000/gwangju_fold3.pth\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f3db776ffbfc436c89040085d212e367", - "version_major": 2, 
- "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/50 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### Best Params (All Folds): {'d_main': 128, 'd_hidden': 32, 'n_blocks': 6, 'dropout': 0.30240965980622486, 'lr': 0.0008659736510641475, 'weight_decay': 0.0003723756511903005} ###\n", - "Early stopping at epoch 32\n", - "Saving model to ./save_model/deepgbm/ctgan20000/gwangju_fold1.pth\n", - "Early stopping at epoch 31\n", - "Saving model to ./save_model/deepgbm/ctgan20000/gwangju_fold2.pth\n", - "Early stopping at epoch 14\n", - "Saving model to ./save_model/deepgbm/ctgan20000/gwangju_fold3.pth\n" - ] - } - ], - "source": [ - "for region in ['seoul','busan','daejeon','daegu','incheon','gwangju']:\n", - " for data_sample in ['pure','smote','ctgan7000','ctgan10000','ctgan20000']:\n", - " fold_voting(model_choose='deepgbm', region=region, data_sample=data_sample, random_state=seed)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 모델별 지역 성능(원데이터)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### FT-Transformer" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### seoul ###\n", - "Test Accuracy: 0.910958904109589\n", - "Test F1-Score: 0.48826800257083774\n", - "Test CSI: 0.34453781512602144\n", - "### busan ###\n", - "Test Accuracy: 0.9679223744292238\n", - "Test F1-Score: 0.5245668459304146\n", - "Test CSI: 0.4158004158003294\n", - "### daejeon ###\n", - "Test Accuracy: 0.9182648401826484\n", - "Test F1-Score: 0.566496524475418\n", - "Test CSI: 0.31021194605006647\n", - "### daegu ###\n", - "Test Accuracy: 0.9768264840182649\n", - "Test F1-Score: 0.7999244404273532\n", - "Test CSI: 0.26181818181808664\n", - "### incheon ###\n", - "Test Accuracy: 0.9181506849315069\n", - "Test F1-Score: 
0.6680714652935209\n", - "Test CSI: 0.5138983050847109\n", - "### gwangju ###\n", - "Test Accuracy: 0.9440639269406392\n", - "Test F1-Score: 0.5373981558893273\n", - "Test CSI: 0.4691224268688549\n" - ] - } - ], - "source": [ - "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", - "\n", - "for region in ['seoul','busan','daejeon','daegu','incheon','gwangju']:\n", - " y_test, test_probs, final_preds = soft_voting(region=region, model_choose='ft_transformer', data_sample='pure')\n", - "\n", - " # 성능 지표 계산\n", - " print(f\"### {region} ###\")\n", - " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", - " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", - " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", - " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", - " '''print(confusion_matrix(y_test, final_preds))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### ResNet-like" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### seoul ###\n", - "Test Accuracy: 0.9075342465753424\n", - "Test F1-Score: 0.47993840811230576\n", - "Test CSI: 0.32217573221754625\n", - "### busan ###\n", - "Test Accuracy: 0.968607305936073\n", - "Test F1-Score: 0.586245547831333\n", - "Test CSI: 0.45652173913034455\n", - "### daejeon ###\n", - "Test Accuracy: 0.9060502283105023\n", - "Test F1-Score: 0.6242613500628746\n", - "Test CSI: 0.3560250391236028\n", - "### daegu ###\n", - "Test Accuracy: 0.9698630136986301\n", - "Test F1-Score: 0.7948471717999764\n", - "Test CSI: 0.25212464589227984\n", - "### incheon ###\n", - "Test Accuracy: 0.9184931506849315\n", - "Test F1-Score: 0.6724661948944141\n", - "Test CSI: 0.5188679245282669\n", - "### gwangju ###\n", - "Test Accuracy: 0.9299086757990868\n", - "Test 
F1-Score: 0.5255850318041286\n", - "Test CSI: 0.4387568555758283\n" - ] - } - ], - "source": [ - "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", - "\n", - "for region in ['seoul','busan','daejeon','daegu','incheon','gwangju']:\n", - " y_test, test_probs, final_preds = soft_voting(region=region, model_choose='resnet_like', data_sample='pure')\n", - "\n", - " # 성능 지표 계산\n", - " print(f\"### {region} ###\")\n", - " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", - " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", - " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", - " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", - " '''print(confusion_matrix(y_test, final_preds))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### DeepGBM" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### seoul ###\n", - "Test Accuracy: 0.9140410958904109\n", - "Test F1-Score: 0.48377287268662217\n", - "Test CSI: 0.33066666666663724\n", - "### busan ###\n", - "Test Accuracy: 0.9658675799086758\n", - "Test F1-Score: 0.5404572118702554\n", - "Test CSI: 0.3983903420522337\n", - "### daejeon ###\n", - "Test Accuracy: 0.9071917808219178\n", - "Test F1-Score: 0.5336211256730917\n", - "Test CSI: 0.2701974865349847\n", - "### daegu ###\n", - "Test Accuracy: 0.9742009132420091\n", - "Test F1-Score: 0.4604475684598693\n", - "Test CSI: 0.2416107382549525\n", - "### incheon ###\n", - "Test Accuracy: 0.9174657534246575\n", - "Test F1-Score: 0.6883413153781813\n", - "Test CSI: 0.5147651006711064\n", - "### gwangju ###\n", - "Test Accuracy: 0.9417808219178082\n", - "Test F1-Score: 0.6238007071539367\n", - "Test CSI: 0.4539614561027351\n" - ] - } - ], - "source": [ - "from sklearn.metrics import accuracy_score, 
roc_auc_score, f1_score\n", - "\n", - "for region in ['seoul','busan','daejeon','daegu','incheon','gwangju']:\n", - " y_test, test_probs, final_preds = soft_voting(region=region, model_choose='deepgbm', data_sample='pure')\n", - "\n", - " # 성능 지표 계산\n", - " print(f\"### {region} ###\")\n", - " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", - " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", - " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", - " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", - " '''print(confusion_matrix(y_test, final_preds))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 지역별 모델 성능 비교" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### seoul" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### ft_transformer ###\n", - "Test Accuracy: 0.910958904109589\n", - "Test F1-Score: 0.48826800257083774\n", - "Test CSI: 0.34453781512602144\n", - "### resnet_like ###\n", - "Test Accuracy: 0.9075342465753424\n", - "Test F1-Score: 0.47993840811230576\n", - "Test CSI: 0.32217573221754625\n", - "### deepgbm ###\n", - "Test Accuracy: 0.9140410958904109\n", - "Test F1-Score: 0.48377287268662217\n", - "Test CSI: 0.33066666666663724\n" - ] - } - ], - "source": [ - "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", - "\n", - "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", - " y_test, test_probs, final_preds = soft_voting(region='seoul', model_choose=model_choose, data_sample='pure')\n", - "\n", - " # 성능 지표 계산\n", - " print(f\"### {model_choose} ###\")\n", - " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", - " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 
AUC'''\n", - " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", - " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", - " '''print(confusion_matrix(y_test, final_preds))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### busan" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### ft_transformer ###\n", - "Test Accuracy: 0.9679223744292238\n", - "Test F1-Score: 0.5245668459304146\n", - "Test CSI: 0.4158004158003294\n", - "### resnet_like ###\n", - "Test Accuracy: 0.968607305936073\n", - "Test F1-Score: 0.586245547831333\n", - "Test CSI: 0.45652173913034455\n", - "### deepgbm ###\n", - "Test Accuracy: 0.9658675799086758\n", - "Test F1-Score: 0.5404572118702554\n", - "Test CSI: 0.3983903420522337\n" - ] - } - ], - "source": [ - "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", - "\n", - "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", - " y_test, test_probs, final_preds = soft_voting(region='busan', model_choose=model_choose, data_sample='pure')\n", - "\n", - " # 성능 지표 계산\n", - " print(f\"### {model_choose} ###\")\n", - " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", - " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", - " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", - " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", - " '''print(confusion_matrix(y_test, final_preds))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### daejeon" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### ft_transformer ###\n", - "Test Accuracy: 0.9182648401826484\n", 
- "Test F1-Score: 0.566496524475418\n", - "Test CSI: 0.31021194605006647\n", - "### resnet_like ###\n", - "Test Accuracy: 0.9060502283105023\n", - "Test F1-Score: 0.6242613500628746\n", - "Test CSI: 0.3560250391236028\n", - "### deepgbm ###\n", - "Test Accuracy: 0.9071917808219178\n", - "Test F1-Score: 0.5336211256730917\n", - "Test CSI: 0.2701974865349847\n" - ] - } - ], - "source": [ - "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", - "\n", - "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", - " y_test, test_probs, final_preds = soft_voting(region='daejeon', model_choose=model_choose, data_sample='pure')\n", - "\n", - " # 성능 지표 계산\n", - " print(f\"### {model_choose} ###\")\n", - " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", - " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", - " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", - " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", - " '''print(confusion_matrix(y_test, final_preds))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### daegu" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### ft_transformer ###\n", - "Test Accuracy: 0.9768264840182649\n", - "Test F1-Score: 0.7999244404273532\n", - "Test CSI: 0.26181818181808664\n", - "### resnet_like ###\n", - "Test Accuracy: 0.9698630136986301\n", - "Test F1-Score: 0.7948471717999764\n", - "Test CSI: 0.25212464589227984\n", - "### deepgbm ###\n", - "Test Accuracy: 0.9742009132420091\n", - "Test F1-Score: 0.4604475684598693\n", - "Test CSI: 0.2416107382549525\n" - ] - } - ], - "source": [ - "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", - "\n", - "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", - " 
y_test, test_probs, final_preds = soft_voting(region='daegu', model_choose=model_choose, data_sample='pure')\n", - "\n", - " # 성능 지표 계산\n", - " print(f\"### {model_choose} ###\")\n", - " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", - " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", - " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", - " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", - " '''print(confusion_matrix(y_test, final_preds))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### incheon" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "### ft_transformer ###\n", - "Test Accuracy: 0.9181506849315069\n", - "Test F1-Score: 0.6680714652935209\n", - "Test CSI: 0.5138983050847109\n", - "### resnet_like ###\n", - "Test Accuracy: 0.9184931506849315\n", - "Test F1-Score: 0.6724661948944141\n", - "Test CSI: 0.5188679245282669\n", - "### deepgbm ###\n", - "Test Accuracy: 0.9174657534246575\n", - "Test F1-Score: 0.6883413153781813\n", - "Test CSI: 0.5147651006711064\n" - ] - } - ], - "source": [ - "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", - "\n", - "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", - " y_test, test_probs, final_preds = soft_voting(region='incheon', model_choose=model_choose, data_sample='pure')\n", - "\n", - " # 성능 지표 계산\n", - " print(f\"### {model_choose} ###\")\n", - " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", - " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", - " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", - " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", - " 
'''print(confusion_matrix(y_test, final_preds))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### gwangju" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import accuracy_score, roc_auc_score, f1_score\n", - "\n", - "for model_choose in ['ft_transformer','resnet_like','deepgbm']:\n", - " y_test, test_probs, final_preds = soft_voting(region='gwangju', model_choose=model_choose, data_sample='pure')\n", - "\n", - " # 성능 지표 계산\n", - " print(f\"### {model_choose} ###\")\n", - " print(\"Test Accuracy:\", accuracy_score(y_test, final_preds))\n", - " '''print(\"Test AUC:\", roc_auc_score(y_test, test_probs, multi_class=\"ovr\")) # 다중분류 AUC'''\n", - " print(\"Test F1-Score:\", f1_score(y_test, final_preds, average=\"macro\")) # 다중분류에서는 macro-F1 사용\n", - " print(\"Test CSI:\", calculate_csi(y_test, final_preds))\n", - " '''print(confusion_matrix(y_test, final_preds))'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 성능비교 시각화 plot" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
| \n", - " | Region | \n", - "Model | \n", - "CSI | \n", - "
|---|---|---|---|
| 0 | \n", - "seoul | \n", - "ft_transformer | \n", - "0.344538 | \n", - "
| 1 | \n", - "seoul | \n", - "resnet_like | \n", - "0.322176 | \n", - "
| 2 | \n", - "seoul | \n", - "deepgbm | \n", - "0.330667 | \n", - "
| 3 | \n", - "busan | \n", - "ft_transformer | \n", - "0.415800 | \n", - "
| 4 | \n", - "busan | \n", - "resnet_like | \n", - "0.456522 | \n", - "
| 5 | \n", - "busan | \n", - "deepgbm | \n", - "0.398390 | \n", - "
| 6 | \n", - "daejeon | \n", - "ft_transformer | \n", - "0.310212 | \n", - "
| 7 | \n", - "daejeon | \n", - "resnet_like | \n", - "0.356025 | \n", - "
| 8 | \n", - "daejeon | \n", - "deepgbm | \n", - "0.270197 | \n", - "
| 9 | \n", - "daegu | \n", - "ft_transformer | \n", - "0.261818 | \n", - "
| 10 | \n", - "daegu | \n", - "resnet_like | \n", - "0.252125 | \n", - "
| 11 | \n", - "daegu | \n", - "deepgbm | \n", - "0.241611 | \n", - "
| 12 | \n", - "incheon | \n", - "ft_transformer | \n", - "0.513898 | \n", - "
| 13 | \n", - "incheon | \n", - "resnet_like | \n", - "0.518868 | \n", - "
| 14 | \n", - "incheon | \n", - "deepgbm | \n", - "0.514765 | \n", - "
| 15 | \n", - "gwangju | \n", - "ft_transformer | \n", - "0.469122 | \n", - "
| 16 | \n", - "gwangju | \n", - "resnet_like | \n", - "0.438757 | \n", - "
| 17 | \n", - "gwangju | \n", - "deepgbm | \n", - "0.453961 | \n", - "