diff --git "a/Analysis_code/3.oversampling.ipynb" "b/Analysis_code/3.oversampling.ipynb" deleted file mode 100644--- "a/Analysis_code/3.oversampling.ipynb" +++ /dev/null @@ -1,974 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## SMOTENC(단순선형보간증강)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "11000 | 11000 | 21893\n", - "12000 | 12000 | 23686\n", - "12500 | 12500 | 24694\n", - "13000 | 13000 | 25149\n", - "12000 | 12000 | 23471\n", - "12000 | 12000 | 23798\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "from imblearn.over_sampling import SMOTENC\n", - "\n", - "# 지역별 데이터 파일 경로\n", - "regions = ['incheon', 'seoul','busan', 'daegu', 'daejeon', 'gwangju']\n", - "file_paths = [f'../data/data_for_modeling/{region}_train.csv' for region in regions]\n", - "output_paths = [f'../data/data_oversampled/smote_{region}.csv' for region in regions]\n", - "\n", - "# 지역별 처리\n", - "for file_path, output_path in zip(file_paths, output_paths):\n", - " # 데이터 로드\n", - " data = pd.read_csv(file_path, index_col=0)\n", - " data['cloudcover'] = data['cloudcover'].astype('int')\n", - " data['lm_cloudcover'] = data['lm_cloudcover'].astype('int')\n", - " X = data.drop(columns=['multi_class', 'binary_class'])\n", - " y = data['multi_class']\n", - "\n", - " # 불필요한 열 제거\n", - " X.drop(columns=['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos'], inplace=True)\n", - "\n", - " # SMOTENC에서 사용할 범주형 변수 열 번호 설정\n", - " categorical_features_indices = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']\n", - "\n", - " # sampling_strategy 설정\n", - " count_class_2 = (y == 2).sum()\n", - " sampling_strategy = {\n", - " 0: int(np.ceil(count_class_2 / 1000) * 500),\n", - " 1: int(np.ceil(count_class_2 / 1000) * 500),\n", - " 2: count_class_2\n", - " }\n", - "\n", - " # SMOTENC 
적용\n", - " smotenc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=sampling_strategy, random_state=42)\n", - " X_resampled, y_resampled = smotenc.fit_resample(X, y)\n", - "\n", - " # Resampled 데이터 생성\n", - " lerp_data = X_resampled.copy()\n", - " lerp_data['multi_class'] = y_resampled\n", - "\n", - " # 제거변수 복구\n", - " lerp_data['binary_class'] = lerp_data['multi_class'].apply(lambda x: 0 if x == 2 else 1)\n", - " lerp_data['hour_sin'] = np.sin(2 * np.pi * lerp_data['hour'] / 24)\n", - " lerp_data['hour_cos'] = np.cos(2 * np.pi * lerp_data['hour'] / 24)\n", - " lerp_data['month_sin'] = np.sin(2 * np.pi * lerp_data['month'] / 12)\n", - " lerp_data['month_cos'] = np.cos(2 * np.pi * lerp_data['month'] / 12)\n", - " lerp_data['ground_temp - temp_C'] = lerp_data['groundtemp'] - lerp_data['temp_C']\n", - "\n", - " # 결과 저장\n", - " lerp_data.to_csv(output_path, index = False)\n", - " print(len(lerp_data[lerp_data['multi_class']==0]),'|',len(lerp_data[lerp_data['multi_class']==1]),'|',len(lerp_data[lerp_data['multi_class']==2]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## SMOTENC(초기선형보간증강) + CTGAN(조건부증강)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'%pip install imbalanced-learn'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'''%pip install imbalanced-learn sdv optuna'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 소수범주당 증강 목표 20000개" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using device: cuda\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[I 2025-01-20 14:56:59,102] A new study created in memory with name: no-name-b76e8d7c-ea9f-42f3-b88f-320929a7e5ac\n", - "[I 2025-01-20 
14:57:16,217] Trial 0 finished with value: -37.318639001504366 and parameters: {'embedding_dim': 71, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 1}. Best is trial 0 with value: -37.318639001504366.\n", - "[I 2025-01-20 14:57:36,855] Trial 1 finished with value: -90.68083127767693 and parameters: {'embedding_dim': 70, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 0 with value: -37.318639001504366.\n", - "[I 2025-01-20 14:58:03,444] Trial 2 finished with value: -13.074059044651209 and parameters: {'embedding_dim': 84, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 8, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 2 with value: -13.074059044651209.\n", - "[I 2025-01-20 14:58:54,752] Trial 3 finished with value: -12.274816765539791 and parameters: {'embedding_dim': 128, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 3 with value: -12.274816765539791.\n", - "[I 2025-01-20 14:59:09,155] Trial 4 finished with value: -34.44549346411927 and parameters: {'embedding_dim': 90, 'generator_dim': (64, 64), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 1}. Best is trial 3 with value: -12.274816765539791.\n", - "[I 2025-01-20 14:59:50,114] Trial 5 finished with value: -9.00948546286418 and parameters: {'embedding_dim': 68, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 5 with value: -9.00948546286418.\n", - "[I 2025-01-20 15:01:11,009] Trial 6 finished with value: -6.388978160012437 and parameters: {'embedding_dim': 118, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. 
Best is trial 6 with value: -6.388978160012437.\n", - "[I 2025-01-20 15:01:25,167] Trial 7 finished with value: -228.64770439962442 and parameters: {'embedding_dim': 108, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 1}. Best is trial 6 with value: -6.388978160012437.\n", - "[I 2025-01-20 15:01:51,402] Trial 8 finished with value: -5.368727454151448 and parameters: {'embedding_dim': 114, 'generator_dim': (64, 64), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 8 with value: -5.368727454151448.\n", - "[I 2025-01-20 15:02:11,569] Trial 9 finished with value: -278.60033287713617 and parameters: {'embedding_dim': 104, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 8 with value: -5.368727454151448.\n", - "[I 2025-01-20 15:03:05,202] Trial 10 finished with value: -11.579568231204782 and parameters: {'embedding_dim': 116, 'generator_dim': (64, 64), 'discriminator_dim': (64, 64), 'pac': 8, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 8 with value: -5.368727454151448.\n", - "[I 2025-01-20 15:04:54,530] Trial 11 finished with value: -51.98893561900591 and parameters: {'embedding_dim': 124, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 3}. Best is trial 8 with value: -5.368727454151448.\n", - "[I 2025-01-20 15:06:15,065] Trial 12 finished with value: -104.10651741905765 and parameters: {'embedding_dim': 112, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. 
Best is trial 8 with value: -5.368727454151448.\n", - "[I 2025-01-20 15:08:05,216] Trial 13 finished with value: -71.59408383664889 and parameters: {'embedding_dim': 99, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 3}. Best is trial 8 with value: -5.368727454151448.\n", - "[I 2025-01-20 15:08:45,626] Trial 14 finished with value: -110.1562292208483 and parameters: {'embedding_dim': 119, 'generator_dim': (64, 64), 'discriminator_dim': (64, 64), 'pac': 8, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 8 with value: -5.368727454151448.\n", - "[I 2025-01-20 15:09:36,821] Trial 15 finished with value: -3.8151168836858163 and parameters: {'embedding_dim': 120, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 15 with value: -3.8151168836858163.\n", - "[I 2025-01-20 15:10:03,381] Trial 16 finished with value: -8.68366593294427 and parameters: {'embedding_dim': 101, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 15 with value: -3.8151168836858163.\n", - "[I 2025-01-20 15:10:30,167] Trial 17 finished with value: -48.95835312594844 and parameters: {'embedding_dim': 93, 'generator_dim': (64, 64), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 15 with value: -3.8151168836858163.\n", - "[I 2025-01-20 15:11:21,958] Trial 18 finished with value: -20.81247815272162 and parameters: {'embedding_dim': 108, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 64, 'discriminator_steps': 1}. 
Best is trial 15 with value: -3.8151168836858163.\n", - "[I 2025-01-20 15:11:49,059] Trial 19 finished with value: -50.99872228893688 and parameters: {'embedding_dim': 81, 'generator_dim': (64, 64), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 15 with value: -3.8151168836858163.\n", - "[I 2025-01-20 15:13:09,572] Trial 20 finished with value: -5.639452046764817 and parameters: {'embedding_dim': 123, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. Best is trial 15 with value: -3.8151168836858163.\n", - "[I 2025-01-20 15:14:30,274] Trial 21 finished with value: -23.57417977156469 and parameters: {'embedding_dim': 128, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. Best is trial 15 with value: -3.8151168836858163.\n", - "[I 2025-01-20 15:15:22,032] Trial 22 finished with value: -0.6778059786818922 and parameters: {'embedding_dim': 122, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:16:13,320] Trial 23 finished with value: -32.5624320546421 and parameters: {'embedding_dim': 113, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:17:04,695] Trial 24 finished with value: -5.785384536972532 and parameters: {'embedding_dim': 121, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. 
Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:17:56,139] Trial 25 finished with value: -8.738764219614195 and parameters: {'embedding_dim': 111, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:18:22,038] Trial 26 finished with value: -184.45992185820504 and parameters: {'embedding_dim': 105, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 8, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:19:13,003] Trial 27 finished with value: -2.6991205788981727 and parameters: {'embedding_dim': 115, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:20:04,624] Trial 28 finished with value: -44.592267621272406 and parameters: {'embedding_dim': 124, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:20:56,181] Trial 29 finished with value: -44.22943387558266 and parameters: {'embedding_dim': 117, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:21:47,508] Trial 30 finished with value: -29.557955872673972 and parameters: {'embedding_dim': 128, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. 
Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:22:38,696] Trial 31 finished with value: -3.0604149741858233 and parameters: {'embedding_dim': 115, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:23:30,109] Trial 32 finished with value: -4.184301463771818 and parameters: {'embedding_dim': 121, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:24:21,165] Trial 33 finished with value: -56.76796024388123 and parameters: {'embedding_dim': 78, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:25:12,460] Trial 34 finished with value: -68.78895549007119 and parameters: {'embedding_dim': 109, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:25:26,435] Trial 35 finished with value: -7.698349027772178 and parameters: {'embedding_dim': 115, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:26:17,239] Trial 36 finished with value: -1.92703376893977 and parameters: {'embedding_dim': 125, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. 
Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:27:09,507] Trial 37 finished with value: -342.3563728328815 and parameters: {'embedding_dim': 124, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:27:30,145] Trial 38 finished with value: -38.64880022938009 and parameters: {'embedding_dim': 65, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:28:21,527] Trial 39 finished with value: -171.94890295218917 and parameters: {'embedding_dim': 126, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:29:42,180] Trial 40 finished with value: -2.797751148097547 and parameters: {'embedding_dim': 95, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:31:31,734] Trial 41 finished with value: -195.52351726339586 and parameters: {'embedding_dim': 95, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 3}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:32:52,135] Trial 42 finished with value: -66.50269881474058 and parameters: {'embedding_dim': 88, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. 
Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:34:11,496] Trial 43 finished with value: -11.503285915156347 and parameters: {'embedding_dim': 89, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:35:03,425] Trial 44 finished with value: -198.7212272708481 and parameters: {'embedding_dim': 104, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:35:30,511] Trial 45 finished with value: -5.755500215208933 and parameters: {'embedding_dim': 99, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 3}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:36:21,350] Trial 46 finished with value: -17.212335519290654 and parameters: {'embedding_dim': 74, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:37:42,167] Trial 47 finished with value: -47.30773434273214 and parameters: {'embedding_dim': 85, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:38:33,458] Trial 48 finished with value: -4.050649756572355 and parameters: {'embedding_dim': 118, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. 
Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:39:53,817] Trial 49 finished with value: -85.45026059502085 and parameters: {'embedding_dim': 93, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. Best is trial 22 with value: -0.6778059786818922.\n", - "[I 2025-01-20 15:40:45,481] A new study created in memory with name: no-name-9f81a759-8c1c-45e7-9717-966f9494aea6\n", - "[I 2025-01-20 15:42:36,221] Trial 0 finished with value: -75.5212527916712 and parameters: {'embedding_dim': 499, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 4, 'batch_size': 1024, 'discriminator_steps': 5}. Best is trial 0 with value: -75.5212527916712.\n", - "[I 2025-01-20 15:44:06,113] Trial 1 finished with value: -111.70948723752186 and parameters: {'embedding_dim': 281, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 0 with value: -75.5212527916712.\n", - "[I 2025-01-20 15:48:33,946] Trial 2 finished with value: -479.1722735079276 and parameters: {'embedding_dim': 468, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 4}. Best is trial 0 with value: -75.5212527916712.\n", - "[I 2025-01-20 15:53:59,017] Trial 3 finished with value: -48.69174576070862 and parameters: {'embedding_dim': 219, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 5}. Best is trial 3 with value: -48.69174576070862.\n", - "[I 2025-01-20 15:56:03,676] Trial 4 finished with value: -4.409888912338193 and parameters: {'embedding_dim': 137, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 3}. 
Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 15:56:37,777] Trial 5 finished with value: -471.9978221268116 and parameters: {'embedding_dim': 229, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 1024, 'discriminator_steps': 1}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 15:58:28,153] Trial 6 finished with value: -601.2484927626908 and parameters: {'embedding_dim': 478, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 1024, 'discriminator_steps': 5}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:00:31,897] Trial 7 finished with value: -695.4921399608821 and parameters: {'embedding_dim': 450, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 4, 'batch_size': 512, 'discriminator_steps': 3}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:04:59,739] Trial 8 finished with value: -47.389652192459366 and parameters: {'embedding_dim': 194, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 4}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:07:34,586] Trial 9 finished with value: -1001.9812539137846 and parameters: {'embedding_dim': 355, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:09:38,375] Trial 10 finished with value: -33.10530614059147 and parameters: {'embedding_dim': 130, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 3}. 
Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:11:41,364] Trial 11 finished with value: -362.128696252612 and parameters: {'embedding_dim': 128, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 3}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:13:12,305] Trial 12 finished with value: -328.82671400582115 and parameters: {'embedding_dim': 130, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:15:47,167] Trial 13 finished with value: -7.9286899630985275 and parameters: {'embedding_dim': 359, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 4}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:18:22,255] Trial 14 finished with value: -342.6117545823074 and parameters: {'embedding_dim': 354, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 4}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:20:57,741] Trial 15 finished with value: -67.35499346617792 and parameters: {'embedding_dim': 391, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 4}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:21:55,034] Trial 16 finished with value: -266.2126635115298 and parameters: {'embedding_dim': 302, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. 
Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:24:30,585] Trial 17 finished with value: -35.730265204691946 and parameters: {'embedding_dim': 426, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 4}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:26:33,459] Trial 18 finished with value: -108.18797777120533 and parameters: {'embedding_dim': 259, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 3}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:27:45,561] Trial 19 finished with value: -71.26357253219594 and parameters: {'embedding_dim': 329, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 1024, 'discriminator_steps': 3}. Best is trial 4 with value: -4.409888912338193.\n", - "[I 2025-01-20 16:29:15,652] Trial 20 finished with value: -1.3630811306533475 and parameters: {'embedding_dim': 402, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 20 with value: -1.3630811306533475.\n", - "[I 2025-01-20 16:30:44,754] Trial 21 finished with value: -488.94996822363987 and parameters: {'embedding_dim': 402, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 20 with value: -1.3630811306533475.\n", - "[I 2025-01-20 16:32:13,977] Trial 22 finished with value: -716.9087223543194 and parameters: {'embedding_dim': 380, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. 
Best is trial 20 with value: -1.3630811306533475.\n", - "[I 2025-01-20 16:34:15,782] Trial 23 finished with value: -1312.349106866347 and parameters: {'embedding_dim': 323, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 3}. Best is trial 20 with value: -1.3630811306533475.\n", - "[I 2025-01-20 16:35:45,149] Trial 24 finished with value: -658.7045880157874 and parameters: {'embedding_dim': 425, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 20 with value: -1.3630811306533475.\n", - "[I 2025-01-20 16:36:41,995] Trial 25 finished with value: -649.0407497590966 and parameters: {'embedding_dim': 358, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. Best is trial 20 with value: -1.3630811306533475.\n", - "[I 2025-01-20 16:39:18,618] Trial 26 finished with value: -137.14788624945254 and parameters: {'embedding_dim': 428, 'generator_dim': (128, 128), 'discriminator_dim': (256, 256), 'pac': 4, 'batch_size': 512, 'discriminator_steps': 4}. Best is trial 20 with value: -1.3630811306533475.\n", - "[I 2025-01-20 16:41:23,169] Trial 27 finished with value: -11.215443967140468 and parameters: {'embedding_dim': 172, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 3}. Best is trial 20 with value: -1.3630811306533475.\n", - "[I 2025-01-20 16:44:54,394] Trial 28 finished with value: -70.15458903694731 and parameters: {'embedding_dim': 301, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 3}. 
Best is trial 20 with value: -1.3630811306533475.\n", - "[I 2025-01-20 16:46:43,597] Trial 29 finished with value: -77.00986664642153 and parameters: {'embedding_dim': 269, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 4, 'batch_size': 1024, 'discriminator_steps': 5}. Best is trial 20 with value: -1.3630811306533475.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processed and saved: ../data/data_oversampled/ctgan_incheon.feather\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[I 2025-01-20 16:48:14,616] A new study created in memory with name: no-name-9673bd2f-608c-45e0-b8eb-39eeba0d0bbe\n", - "[I 2025-01-20 16:48:37,588] Trial 0 finished with value: -23.155852178206054 and parameters: {'embedding_dim': 72, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 0 with value: -23.155852178206054.\n", - "[I 2025-01-20 16:48:42,118] Trial 1 finished with value: -34.21490168723894 and parameters: {'embedding_dim': 90, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 1}. Best is trial 0 with value: -23.155852178206054.\n", - "[I 2025-01-20 16:49:05,052] Trial 2 finished with value: -4.003410149026237 and parameters: {'embedding_dim': 118, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:49:28,467] Trial 3 finished with value: -21.564028021550694 and parameters: {'embedding_dim': 106, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 64, 'discriminator_steps': 1}. 
Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:49:51,875] Trial 4 finished with value: -7.355484157065245 and parameters: {'embedding_dim': 112, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:50:00,722] Trial 5 finished with value: -32.39901609359498 and parameters: {'embedding_dim': 74, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 3}. Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:50:24,169] Trial 6 finished with value: -4.0750874978322935 and parameters: {'embedding_dim': 68, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:50:30,904] Trial 7 finished with value: -5.8933791686324675 and parameters: {'embedding_dim': 117, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:50:37,571] Trial 8 finished with value: -15.325497040087624 and parameters: {'embedding_dim': 128, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:50:48,499] Trial 9 finished with value: -16.06789263895181 and parameters: {'embedding_dim': 112, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 1}. 
Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:51:11,461] Trial 10 finished with value: -6.933575610142412 and parameters: {'embedding_dim': 92, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:51:48,391] Trial 11 finished with value: -4.1384946672555945 and parameters: {'embedding_dim': 64, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:52:38,821] Trial 12 finished with value: -9.416054623408765 and parameters: {'embedding_dim': 82, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 3}. Best is trial 2 with value: -4.003410149026237.\n", - "[I 2025-01-20 16:52:55,737] Trial 13 finished with value: -2.7694488886245567 and parameters: {'embedding_dim': 102, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 13 with value: -2.7694488886245567.\n", - "[I 2025-01-20 16:53:12,663] Trial 14 finished with value: -8.615367995825054 and parameters: {'embedding_dim': 100, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 13 with value: -2.7694488886245567.\n", - "[I 2025-01-20 16:53:29,586] Trial 15 finished with value: -6.889357108499393 and parameters: {'embedding_dim': 123, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. 
Best is trial 13 with value: -2.7694488886245567.\n", - "[I 2025-01-20 16:53:52,531] Trial 16 finished with value: -9.171953998759633 and parameters: {'embedding_dim': 98, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 13 with value: -2.7694488886245567.\n", - "[I 2025-01-20 16:54:09,458] Trial 17 finished with value: -57.35581165476122 and parameters: {'embedding_dim': 106, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 13 with value: -2.7694488886245567.\n", - "[I 2025-01-20 16:54:26,358] Trial 18 finished with value: -8.11141919896362 and parameters: {'embedding_dim': 119, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 13 with value: -2.7694488886245567.\n", - "[I 2025-01-20 16:54:49,299] Trial 19 finished with value: -36.89391231331456 and parameters: {'embedding_dim': 105, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 13 with value: -2.7694488886245567.\n", - "[I 2025-01-20 16:55:12,263] Trial 20 finished with value: -0.915130278465809 and parameters: {'embedding_dim': 85, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:55:35,365] Trial 21 finished with value: -9.14515742082317 and parameters: {'embedding_dim': 88, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. 
Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:55:58,368] Trial 22 finished with value: -11.94829599012277 and parameters: {'embedding_dim': 81, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:56:15,325] Trial 23 finished with value: -1.5616654362533082 and parameters: {'embedding_dim': 82, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:56:32,320] Trial 24 finished with value: -2.062209737409747 and parameters: {'embedding_dim': 82, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:56:49,278] Trial 25 finished with value: -2.528799284383497 and parameters: {'embedding_dim': 80, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:57:06,209] Trial 26 finished with value: -5.408362529903892 and parameters: {'embedding_dim': 76, 'generator_dim': (64, 64), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:57:23,136] Trial 27 finished with value: -8.895873144737894 and parameters: {'embedding_dim': 88, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. 
Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:57:29,847] Trial 28 finished with value: -67.16752180682067 and parameters: {'embedding_dim': 85, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:57:46,817] Trial 29 finished with value: -2.3990267779936456 and parameters: {'embedding_dim': 94, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:58:09,819] Trial 30 finished with value: -1.0609161149494997 and parameters: {'embedding_dim': 77, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:58:32,799] Trial 31 finished with value: -40.89668749736999 and parameters: {'embedding_dim': 76, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:58:55,758] Trial 32 finished with value: -20.543485177832817 and parameters: {'embedding_dim': 71, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:59:18,740] Trial 33 finished with value: -48.817409667795275 and parameters: {'embedding_dim': 85, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. 
Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:59:41,793] Trial 34 finished with value: -8.609796785683871 and parameters: {'embedding_dim': 80, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 16:59:52,805] Trial 35 finished with value: -0.9307100824718637 and parameters: {'embedding_dim': 70, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:00:03,767] Trial 36 finished with value: -18.096015461780212 and parameters: {'embedding_dim': 70, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:00:08,288] Trial 37 finished with value: -47.453982612938844 and parameters: {'embedding_dim': 76, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 1}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:00:31,733] Trial 38 finished with value: -317.9490947764603 and parameters: {'embedding_dim': 67, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 1}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:00:36,269] Trial 39 finished with value: -5.325056185063841 and parameters: {'embedding_dim': 74, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 1}. 
Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:01:26,667] Trial 40 finished with value: -2.4975095593414944 and parameters: {'embedding_dim': 65, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 3}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:01:37,549] Trial 41 finished with value: -8.724928631283321 and parameters: {'embedding_dim': 85, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:01:54,575] Trial 42 finished with value: -9.957479554800914 and parameters: {'embedding_dim': 78, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:02:17,543] Trial 43 finished with value: -20.086840991896658 and parameters: {'embedding_dim': 90, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 3}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:02:28,489] Trial 44 finished with value: -123.24510747774687 and parameters: {'embedding_dim': 73, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 1}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:02:51,367] Trial 45 finished with value: -2.8224441691394726 and parameters: {'embedding_dim': 83, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 8, 'batch_size': 128, 'discriminator_steps': 3}. 
Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:03:08,278] Trial 46 finished with value: -61.541871320706946 and parameters: {'embedding_dim': 78, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:03:45,236] Trial 47 finished with value: -44.6086713907501 and parameters: {'embedding_dim': 71, 'generator_dim': (64, 64), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 64, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:03:49,767] Trial 48 finished with value: -18.76213014994069 and parameters: {'embedding_dim': 69, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 1}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:04:06,677] Trial 49 finished with value: -14.235673820573563 and parameters: {'embedding_dim': 92, 'generator_dim': (128, 128), 'discriminator_dim': (64, 64), 'pac': 8, 'batch_size': 128, 'discriminator_steps': 2}. Best is trial 20 with value: -0.915130278465809.\n", - "[I 2025-01-20 17:04:29,881] A new study created in memory with name: no-name-cd965ffc-fc7d-429e-ae05-208f0c3fad33\n", - "[I 2025-01-20 17:06:32,714] Trial 0 finished with value: -522.274136185342 and parameters: {'embedding_dim': 225, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 3}. Best is trial 0 with value: -522.274136185342.\n", - "[I 2025-01-20 17:07:20,042] Trial 1 finished with value: -586.1407911119707 and parameters: {'embedding_dim': 480, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 1024, 'discriminator_steps': 4}. 
Best is trial 0 with value: -522.274136185342.\n", - "[I 2025-01-20 17:08:50,110] Trial 2 finished with value: -1338.898800006008 and parameters: {'embedding_dim': 383, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 0 with value: -522.274136185342.\n", - "[I 2025-01-20 17:09:46,877] Trial 3 finished with value: -595.5573560928374 and parameters: {'embedding_dim': 441, 'generator_dim': (128, 128), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 1024, 'discriminator_steps': 5}. Best is trial 0 with value: -522.274136185342.\n", - "[I 2025-01-20 17:10:15,266] Trial 4 finished with value: -186.13472727208838 and parameters: {'embedding_dim': 186, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 1024, 'discriminator_steps': 2}. Best is trial 4 with value: -186.13472727208838.\n", - "[I 2025-01-20 17:11:59,408] Trial 5 finished with value: -556.572362981219 and parameters: {'embedding_dim': 379, 'generator_dim': (128, 128), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 5}. Best is trial 4 with value: -186.13472727208838.\n", - "[I 2025-01-20 17:13:07,775] Trial 6 finished with value: -30.304752691466533 and parameters: {'embedding_dim': 214, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 3}. Best is trial 6 with value: -30.304752691466533.\n", - "[I 2025-01-20 17:14:32,975] Trial 7 finished with value: -31.549366617667648 and parameters: {'embedding_dim': 388, 'generator_dim': (128, 128), 'discriminator_dim': (256, 256), 'pac': 4, 'batch_size': 512, 'discriminator_steps': 4}. 
Best is trial 6 with value: -30.304752691466533.\n", - "[I 2025-01-20 17:15:12,970] Trial 8 finished with value: -345.73649432175523 and parameters: {'embedding_dim': 409, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'pac': 8, 'batch_size': 1024, 'discriminator_steps': 3}. Best is trial 6 with value: -30.304752691466533.\n", - "[I 2025-01-20 17:15:45,595] Trial 9 finished with value: -1021.6740667962498 and parameters: {'embedding_dim': 451, 'generator_dim': (128, 128), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. Best is trial 6 with value: -30.304752691466533.\n", - "[I 2025-01-20 17:16:18,456] Trial 10 finished with value: -0.6514198871412 and parameters: {'embedding_dim': 270, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:16:51,307] Trial 11 finished with value: -4.8963478843156665 and parameters: {'embedding_dim': 277, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:17:24,454] Trial 12 finished with value: -153.22796082787534 and parameters: {'embedding_dim': 289, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:17:56,888] Trial 13 finished with value: -199.5670473579735 and parameters: {'embedding_dim': 299, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. 
Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:18:47,371] Trial 14 finished with value: -140.37597916237385 and parameters: {'embedding_dim': 134, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:19:20,921] Trial 15 finished with value: -3561.1724538614976 and parameters: {'embedding_dim': 261, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:20:51,379] Trial 16 finished with value: -547.4278435153543 and parameters: {'embedding_dim': 334, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:21:24,230] Trial 17 finished with value: -1821.0058166894948 and parameters: {'embedding_dim': 341, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:22:14,676] Trial 18 finished with value: -21.687672514288167 and parameters: {'embedding_dim': 253, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:23:12,549] Trial 19 finished with value: -110.57179286531195 and parameters: {'embedding_dim': 161, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 1}. 
Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:24:39,667] Trial 20 finished with value: -880.2718942270703 and parameters: {'embedding_dim': 259, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 4}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:25:29,860] Trial 21 finished with value: -15.677402299300798 and parameters: {'embedding_dim': 247, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:26:20,084] Trial 22 finished with value: -443.87283814186236 and parameters: {'embedding_dim': 303, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:26:52,252] Trial 23 finished with value: -1813.9072516488602 and parameters: {'embedding_dim': 216, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:27:42,365] Trial 24 finished with value: -3066.9405889616755 and parameters: {'embedding_dim': 273, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:28:15,491] Trial 25 finished with value: -49.0184152543112 and parameters: {'embedding_dim': 343, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. 
Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:29:06,215] Trial 26 finished with value: -1436.6956066386338 and parameters: {'embedding_dim': 233, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 512, 'discriminator_steps': 2}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:29:39,298] Trial 27 finished with value: -1622.8851329380343 and parameters: {'embedding_dim': 187, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 512, 'discriminator_steps': 1}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:31:10,397] Trial 28 finished with value: -610.6098245576701 and parameters: {'embedding_dim': 318, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'pac': 8, 'batch_size': 256, 'discriminator_steps': 2}. Best is trial 10 with value: -0.6514198871412.\n", - "[I 2025-01-20 17:31:48,784] Trial 29 finished with value: -227.41157626188772 and parameters: {'embedding_dim': 233, 'generator_dim': (256, 256), 'discriminator_dim': (128, 128), 'pac': 4, 'batch_size': 1024, 'discriminator_steps': 3}. 
import warnings

import numpy as np
import optuna
import pandas as pd
import torch
from ctgan import CTGAN
from imblearn.over_sampling import SMOTENC

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
SEED = 42                          # same value the SMOTENC step already used
TARGET_PER_MINORITY_CLASS = 20000  # CTGAN rows requested per minority class
TUNING_COLUMN = 'visi'             # column whose mean/std drive the tuning score

# Per-region input / output file paths.
regions = ['busan', 'daegu', 'daejeon', 'gwangju', 'incheon', 'seoul']
file_paths = [f'../data/data_for_modeling/{region}_train.csv' for region in regions]
output_paths = [f'../data/data_oversampled/ctgan20000_{region}.csv' for region in regions]

LABEL_COLUMNS = ['multi_class', 'binary_class']
# Columns that are pure functions of other columns: dropped before resampling
# and recomputed afterwards so that synthetic rows stay internally consistent.
DERIVED_COLUMNS = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']

# Generated rows are kept only when `visi` falls inside [low, high) for their class.
# NOTE(review): presumably these bounds mirror how multi_class was derived from
# `visi` upstream -- confirm against the labelling code.
VISI_RANGE = {0: (0, 100), 1: (100, 500)}

# Optuna search spaces (previously hard-coded in two copy-pasted objectives).
CLASS0_SEARCH_SPACE = {
    'embedding_dim': (64, 128),
    'layer_dims': [(64, 64), (128, 128)],
    'batch_size': [64, 128, 256],
    'discriminator_steps': (1, 3),
    'n_trials': 50,
}
CLASS1_SEARCH_SPACE = {
    'embedding_dim': (128, 512),
    'layer_dims': [(128, 128), (256, 256)],
    'batch_size': [256, 512, 1024],
    'discriminator_steps': (1, 5),
    'n_trials': 30,
}

# Report which device is available (informational only; it is not passed on).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Silence the UserWarning optuna emits for the tuple-valued categorical choices.
warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")


def seed_everything(seed=SEED):
    """Seed the NumPy and torch global RNGs.

    Called before every CTGAN fit so that a trial and the final refit with the
    same hyper-parameters start from the same random state.
    NOTE(review): GPU kernels may still be non-deterministic -- treat runs on
    CUDA as "approximately" reproducible.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)


def build_sampling_strategy(y):
    """Return the per-class SMOTENC target counts for label series ``y``.

    * class 0 -> rounded up to the next multiple of 500 (at least 500). This is
      identical to the former ``500 if n <= 500 else 1000`` rule for n <= 1000,
      but no longer asks SMOTENC for *fewer* rows than exist when n > 1000.
    * class 1 -> rounded up to the next multiple of 100.
    * class 2 -> left at its current size (no over-sampling).
    """
    count_class_0 = int((y == 0).sum())
    count_class_1 = int((y == 1).sum())
    count_class_2 = int((y == 2).sum())
    return {
        0: max(500, int(np.ceil(count_class_0 / 500) * 500)),
        1: int(np.ceil(count_class_1 / 100) * 100),
        2: count_class_2,
    }


def moment_gap(real, generated):
    """Squared difference of the means plus squared difference of the stds."""
    return (real.mean() - generated.mean()) ** 2 + (real.std() - generated.std()) ** 2


def make_objective(class_rows, discrete_columns, space):
    """Build an Optuna objective that tunes a CTGAN on ``class_rows``.

    Parameters
    ----------
    class_rows : pandas.DataFrame
        Training rows of a single class.
    discrete_columns : list of str
        Columns CTGAN must treat as discrete.
    space : dict
        One of the ``*_SEARCH_SPACE`` dictionaries defined above.

    Returns
    -------
    callable
        ``objective(trial) -> float``; higher is better (negated moment gap of
        ``TUNING_COLUMN`` between real and generated rows).
    """
    def objective(trial):
        # Suggestion order matches the original objectives.
        params = {
            'embedding_dim': trial.suggest_int('embedding_dim', *space['embedding_dim']),
            'generator_dim': trial.suggest_categorical('generator_dim', space['layer_dims']),
            'discriminator_dim': trial.suggest_categorical('discriminator_dim', space['layer_dims']),
            'pac': trial.suggest_categorical('pac', [4, 8]),
            'batch_size': trial.suggest_categorical('batch_size', space['batch_size']),
            'discriminator_steps': trial.suggest_int(
                'discriminator_steps', *space['discriminator_steps']
            ),
        }
        seed_everything()
        model = CTGAN(**params)
        model.fit(class_rows, discrete_columns=discrete_columns)
        generated = model.sample(len(class_rows) * 2)
        return -float(moment_gap(class_rows[TUNING_COLUMN], generated[TUNING_COLUMN]))

    return objective


def synthesize_class(class_rows, discrete_columns, space, n_samples):
    """Tune a CTGAN on ``class_rows`` and return ``n_samples`` generated rows.

    Returns an empty frame with the same columns (and skips tuning/training)
    when ``n_samples`` is not positive, instead of handing a zero or negative
    count to ``CTGAN.sample``.
    """
    if n_samples <= 0:
        return class_rows.iloc[0:0].copy()

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )
    study.optimize(make_objective(class_rows, discrete_columns, space), n_trials=space['n_trials'])

    # Refit with the best hyper-parameters from the same seed as the trials, so
    # the final model corresponds to the run that produced the best score
    # rather than an unrelated random initialisation.
    seed_everything()
    model = CTGAN(**study.best_params)
    model.fit(class_rows, discrete_columns=discrete_columns)
    return model.sample(n_samples)


def keep_in_visi_range(generated, class_id):
    """Keep generated rows whose ``visi`` lies in ``VISI_RANGE[class_id]`` (low inclusive, high exclusive)."""
    low, high = VISI_RANGE[class_id]
    return generated[(generated['visi'] >= low) & (generated['visi'] < high)]


def restore_derived_features(frame):
    """Return a copy of ``frame`` with the label flag and derived columns recomputed.

    ``binary_class`` is 0 for multi_class == 2 and 1 otherwise; the cyclic
    hour/month encodings and the ground/air temperature difference are rebuilt
    from ``hour``, ``month``, ``groundtemp`` and ``temp_C``.
    """
    restored = frame.copy()
    restored['binary_class'] = (restored['multi_class'] != 2).astype(int)
    restored['hour_sin'] = np.sin(2 * np.pi * restored['hour'] / 24)
    restored['hour_cos'] = np.cos(2 * np.pi * restored['hour'] / 24)
    restored['month_sin'] = np.sin(2 * np.pi * restored['month'] / 12)
    restored['month_cos'] = np.cos(2 * np.pi * restored['month'] / 12)
    restored['ground_temp - temp_C'] = restored['groundtemp'] - restored['temp_C']
    return restored


def augment_region(file_path, output_path):
    """Over-sample one region's training file and write the result as CSV.

    Steps: SMOTENC to lift the minority classes to a small round count, one
    tuned CTGAN per minority class to generate additional rows, range filter
    on ``visi``, recomputation of derived columns, and finally the original
    (untouched) class-2 rows are appended.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to ``output_path``.
    """
    data = pd.read_csv(file_path, index_col=0)
    # NOTE(review): the SMOTENC-only cell earlier in this notebook casts
    # 'cloudcover' / 'lm_cloudcover' to int so they count as categorical; this
    # cell does not. Confirm whether treating them as continuous is intended.
    features = data.drop(columns=LABEL_COLUMNS + DERIVED_COLUMNS)
    labels = data['multi_class']

    # Every non-float64 column is treated as categorical by SMOTENC.
    categorical_idx = [i for i, dtype in enumerate(features.dtypes) if dtype != 'float64']
    strategy = build_sampling_strategy(labels)

    smotenc = SMOTENC(
        categorical_features=categorical_idx,
        sampling_strategy=strategy,
        random_state=SEED,
    )
    X_resampled, y_resampled = smotenc.fit_resample(features, labels)

    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # Same dtype rule for CTGAN, by column name (this includes 'multi_class'
    # whenever it is not float64).
    discrete_columns = [
        col for col, dtype in zip(lerp_data.columns, lerp_data.dtypes) if dtype != 'float64'
    ]

    generated_0 = synthesize_class(
        lerp_data[lerp_data['multi_class'] == 0],
        discrete_columns,
        CLASS0_SEARCH_SPACE,
        TARGET_PER_MINORITY_CLASS,
    )
    # Class 1 is only topped up by the amount SMOTENC did not already provide.
    generated_1 = synthesize_class(
        lerp_data[lerp_data['multi_class'] == 1],
        discrete_columns,
        CLASS1_SEARCH_SPACE,
        TARGET_PER_MINORITY_CLASS - strategy[1],
    )

    augmented = pd.concat(
        [lerp_data, keep_in_visi_range(generated_0, 0), keep_in_visi_range(generated_1, 1)],
        axis=0,
    )
    augmented = restore_derived_features(augmented)

    # Majority-class rows are taken from the original file, not from the resampled frame.
    minority_rows = augmented[augmented['multi_class'] != 2]
    majority_rows = data[data['multi_class'] == 2]
    final_data = pd.concat([minority_rows, majority_rows], axis=0).reset_index(drop=True)

    final_data.to_csv(output_path, index=False)
    return final_data


# Process every region and report the resulting class sizes as "n0 | n1 | n2".
for file_path, output_path in zip(file_paths, output_paths):
    final_data = augment_region(file_path, output_path)
    class_sizes = [int((final_data['multi_class'] == label).sum()) for label in (0, 1, 2)]
    print(*class_sizes, sep=' | ')
(Triggered internally at ../aten/src/ATen/cuda/CublasHandlePool.cpp:135.)\n", - " return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n", - "[W 2025-01-22 02:07:55,487] Trial 0 failed with parameters: {'embedding_dim': 96, 'generator_dim': (64, 64), 'discriminator_dim': (64, 64), 'pac': 8, 'batch_size': 64, 'discriminator_steps': 2} because of the following error: KeyboardInterrupt().\n", - "Traceback (most recent call last):\n", - " File \"/root/miniconda3/lib/python3.12/site-packages/optuna/study/_optimize.py\", line 197, in _run_trial\n", - " value_or_values = func(trial)\n", - " ^^^^^^^^^^^\n", - " File \"/tmp/ipykernel_386644/1364079751.py\", line 81, in objective\n", - " ctgan.fit(data_0, discrete_columns=categorical_features)\n", - " File \"/root/miniconda3/lib/python3.12/site-packages/ctgan/synthesizers/base.py\", line 50, in wrapper\n", - " return function(self, *args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/root/miniconda3/lib/python3.12/site-packages/ctgan/synthesizers/ctgan.py\", line 408, in fit\n", - " loss_d.backward()\n", - " File \"/root/miniconda3/lib/python3.12/site-packages/torch/_tensor.py\", line 581, in backward\n", - " torch.autograd.backward(\n", - " File \"/root/miniconda3/lib/python3.12/site-packages/torch/autograd/__init__.py\", line 347, in backward\n", - " _engine_run_backward(\n", - " File \"/root/miniconda3/lib/python3.12/site-packages/torch/autograd/graph.py\", line 825, in _engine_run_backward\n", - " return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - "KeyboardInterrupt\n", - "[W 2025-01-22 02:07:55,488] Trial 0 failed with value None.\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 96\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[38;5;66;03m# Optuna로 최적화 수행\u001b[39;00m\n\u001b[1;32m 95\u001b[0m study \u001b[38;5;241m=\u001b[39m optuna\u001b[38;5;241m.\u001b[39mcreate_study(direction\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmaximize\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 96\u001b[0m \u001b[43mstudy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobjective\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_trials\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m50\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;66;03m# 최적 하이퍼파라미터 출력\u001b[39;00m\n\u001b[1;32m 99\u001b[0m best_params \u001b[38;5;241m=\u001b[39m study\u001b[38;5;241m.\u001b[39mbest_params\n", - "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/optuna/study/study.py:475\u001b[0m, in \u001b[0;36mStudy.optimize\u001b[0;34m(self, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001b[0m\n\u001b[1;32m 373\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21moptimize\u001b[39m(\n\u001b[1;32m 374\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 375\u001b[0m func: ObjectiveFuncType,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 382\u001b[0m show_progress_bar: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 383\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 384\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Optimize an objective function.\u001b[39;00m\n\u001b[1;32m 385\u001b[0m \n\u001b[1;32m 386\u001b[0m \u001b[38;5;124;03m Optimization is 
done by choosing a suitable set of hyperparameter values from a given\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 473\u001b[0m \u001b[38;5;124;03m If nested invocation of this method occurs.\u001b[39;00m\n\u001b[1;32m 474\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 475\u001b[0m \u001b[43m_optimize\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[43m \u001b[49m\u001b[43mstudy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 477\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 478\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_trials\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_trials\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 479\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 480\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 481\u001b[0m \u001b[43m \u001b[49m\u001b[43mcatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mtuple\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcatch\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43misinstance\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcatch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mIterable\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mcatch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 482\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 483\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mgc_after_trial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgc_after_trial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 484\u001b[0m \u001b[43m \u001b[49m\u001b[43mshow_progress_bar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mshow_progress_bar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 485\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/optuna/study/_optimize.py:63\u001b[0m, in \u001b[0;36m_optimize\u001b[0;34m(study, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_jobs \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m---> 63\u001b[0m \u001b[43m_optimize_sequential\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 64\u001b[0m \u001b[43m \u001b[49m\u001b[43mstudy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 65\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 66\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_trials\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 67\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 68\u001b[0m \u001b[43m \u001b[49m\u001b[43mcatch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 69\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 70\u001b[0m \u001b[43m \u001b[49m\u001b[43mgc_after_trial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 71\u001b[0m \u001b[43m \u001b[49m\u001b[43mreseed_sampler_rng\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 72\u001b[0m \u001b[43m \u001b[49m\u001b[43mtime_start\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 73\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mprogress_bar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprogress_bar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 74\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_jobs \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n", - "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/optuna/study/_optimize.py:160\u001b[0m, in \u001b[0;36m_optimize_sequential\u001b[0;34m(study, func, n_trials, timeout, catch, callbacks, gc_after_trial, reseed_sampler_rng, time_start, progress_bar)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 160\u001b[0m frozen_trial \u001b[38;5;241m=\u001b[39m \u001b[43m_run_trial\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstudy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcatch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 162\u001b[0m \u001b[38;5;66;03m# The following line mitigates memory problems that can be occurred in some\u001b[39;00m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;66;03m# environments (e.g., services that use computing containers such as GitHub Actions).\u001b[39;00m\n\u001b[1;32m 164\u001b[0m \u001b[38;5;66;03m# Please refer to the following PR for further details:\u001b[39;00m\n\u001b[1;32m 165\u001b[0m \u001b[38;5;66;03m# https://github.com/optuna/optuna/pull/325.\u001b[39;00m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gc_after_trial:\n", - "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/optuna/study/_optimize.py:248\u001b[0m, in \u001b[0;36m_run_trial\u001b[0;34m(study, func, catch)\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m 
\u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShould not reach.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 244\u001b[0m frozen_trial\u001b[38;5;241m.\u001b[39mstate \u001b[38;5;241m==\u001b[39m TrialState\u001b[38;5;241m.\u001b[39mFAIL\n\u001b[1;32m 245\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m func_err \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 246\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(func_err, catch)\n\u001b[1;32m 247\u001b[0m ):\n\u001b[0;32m--> 248\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m func_err\n\u001b[1;32m 249\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m frozen_trial\n", - "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/optuna/study/_optimize.py:197\u001b[0m, in \u001b[0;36m_run_trial\u001b[0;34m(study, func, catch)\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m get_heartbeat_thread(trial\u001b[38;5;241m.\u001b[39m_trial_id, study\u001b[38;5;241m.\u001b[39m_storage):\n\u001b[1;32m 196\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 197\u001b[0m value_or_values \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mTrialPruned \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 199\u001b[0m \u001b[38;5;66;03m# TODO(mamu): Handle multi-objective cases.\u001b[39;00m\n\u001b[1;32m 200\u001b[0m state \u001b[38;5;241m=\u001b[39m TrialState\u001b[38;5;241m.\u001b[39mPRUNED\n", - "Cell \u001b[0;32mIn[1], line 81\u001b[0m, in \u001b[0;36mobjective\u001b[0;34m(trial)\u001b[0m\n\u001b[1;32m 78\u001b[0m data_0 \u001b[38;5;241m=\u001b[39m 
lerp_data[lerp_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmulti_class\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 80\u001b[0m \u001b[38;5;66;03m# 모델 학습\u001b[39;00m\n\u001b[0;32m---> 81\u001b[0m \u001b[43mctgan\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_0\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdiscrete_columns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_features\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 83\u001b[0m \u001b[38;5;66;03m# 샘플 생성\u001b[39;00m\n\u001b[1;32m 84\u001b[0m generated_data \u001b[38;5;241m=\u001b[39m ctgan\u001b[38;5;241m.\u001b[39msample(\u001b[38;5;28mlen\u001b[39m(data_0) \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m2\u001b[39m)\n", - "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/ctgan/synthesizers/base.py:50\u001b[0m, in \u001b[0;36mrandom_state..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrandom_states \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 50\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
set_random_states(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrandom_states, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mset_random_state):\n", - "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/ctgan/synthesizers/ctgan.py:408\u001b[0m, in \u001b[0;36mCTGAN.fit\u001b[0;34m(self, train_data, discrete_columns, epochs)\u001b[0m\n\u001b[1;32m 406\u001b[0m optimizerD\u001b[38;5;241m.\u001b[39mzero_grad(set_to_none\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 407\u001b[0m pen\u001b[38;5;241m.\u001b[39mbackward(retain_graph\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m--> 408\u001b[0m \u001b[43mloss_d\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 409\u001b[0m optimizerD\u001b[38;5;241m.\u001b[39mstep()\n\u001b[1;32m 411\u001b[0m fakez \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mnormal(mean\u001b[38;5;241m=\u001b[39mmean, std\u001b[38;5;241m=\u001b[39mstd)\n", - "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/_tensor.py:581\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 571\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 572\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 573\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 574\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 579\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 580\u001b[0m )\n\u001b[0;32m--> 581\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 582\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 583\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/__init__.py:347\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 342\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 344\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 345\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 346\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 347\u001b[0m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 349\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 350\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 351\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 352\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 353\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 354\u001b[0m \u001b[43m 
# %% [markdown]
# ## SMOTENC (initial linear-interpolation augmentation) + CTGAN (conditional augmentation)
#
# One pipeline replaces the two copy-pasted cells that differed only in the
# per-minority-class target (10,000 vs 7,000 rows). For every region the
# CTGAN models are tuned and trained once, then sampled for each target.

# %%
import warnings

import numpy as np
import pandas as pd

# The ML stack (imbalanced-learn, Optuna, CTGAN, torch) is imported inside the
# functions that use it, so the pure pandas/numpy helpers below stay importable
# (and checkable) on a machine without that stack installed.

# ---------------------------------------------------------------- configuration
REGIONS = ['busan', 'daegu', 'daejeon', 'gwangju', 'incheon', 'seoul']
TRAIN_PATH = '../data/data_for_modeling/{region}_train.csv'
OUTPUT_PATH = '../data/data_oversampled/ctgan{target}_{region}.csv'

SEED = 42
# Desired number of rows per minority class (0 and 1) in each output file.
MINORITY_TARGETS = (10000, 7000)

# Columns derived from other columns: dropped before resampling, rebuilt after.
DERIVED_COLUMNS = ['ground_temp - temp_C', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
# Cast to int so they count as categorical (dtype != float64), mirroring the
# SMOTENC-only cell earlier in this notebook.
INTEGER_CODED_COLUMNS = ['cloudcover', 'lm_cloudcover']

# Half-open `visi` interval [low, high) a synthetic row must fall in to be
# accepted for a class.
VISI_RANGES = {0: (0, 100), 1: (100, 500)}

# Optuna search space per minority class.
SEARCH_SPACES = {
    0: {
        'embedding_dim': (64, 128),
        'layer_dims': [(64, 64), (128, 128)],
        'batch_size': [64, 128, 256],
        'max_discriminator_steps': 3,
        'n_trials': 50,
    },
    1: {
        'embedding_dim': (128, 512),
        'layer_dims': [(128, 128), (256, 256)],
        'batch_size': [256, 512, 1024],
        'max_discriminator_steps': 5,
        'n_trials': 30,
    },
}


# ---------------------------------------------------------------- pure helpers
def smote_targets(y):
    """Build the SMOTENC ``sampling_strategy`` dict from the class labels.

    Class 0 is rounded up to the next multiple of 500, class 1 to the next
    multiple of 100, and class 2 is left at its current size.

    For up to 1000 class-0 rows this equals the previous rule (500 if n <= 500,
    otherwise 1000). Above 1000 the previous rule requested fewer rows than
    exist, which an over-sampler rejects; rounding up never does.
    """
    n0 = int((y == 0).sum())
    n1 = int((y == 1).sum())
    n2 = int((y == 2).sum())
    return {
        0: int(np.ceil(n0 / 500) * 500),
        1: int(np.ceil(n1 / 100) * 100),
        2: n2,
    }


def moment_gap(real, synthetic):
    """Squared difference of means plus squared difference of standard deviations."""
    return (real.mean() - synthetic.mean()) ** 2 + (real.std() - synthetic.std()) ** 2


def restore_derived_columns(frame):
    """Return a copy of ``frame`` with the dropped derived columns recomputed.

    Adds ``binary_class`` (0 for multi_class 2, otherwise 1), the cyclic
    hour/month encodings, and ``ground_temp - temp_C``. The input is not mutated.
    """
    restored = frame.copy()
    restored['binary_class'] = (restored['multi_class'] != 2).astype(int)
    restored['hour_sin'] = np.sin(2 * np.pi * restored['hour'] / 24)
    restored['hour_cos'] = np.cos(2 * np.pi * restored['hour'] / 24)
    restored['month_sin'] = np.sin(2 * np.pi * restored['month'] / 12)
    restored['month_cos'] = np.cos(2 * np.pi * restored['month'] / 12)
    restored['ground_temp - temp_C'] = restored['groundtemp'] - restored['temp_C']
    return restored


def sample_in_range(sample_fn, n_rows, column, low, high, max_rounds=20):
    """Draw rows from ``sample_fn`` until ``n_rows`` satisfy ``low <= column < high``.

    Filtering a single fixed-size draw leaves the class short of its target by
    however many rows fall outside the range, so keep asking for the shortfall
    instead.

    Args:
        sample_fn: callable taking a row count and returning a DataFrame.
        n_rows: number of accepted rows wanted; nothing is drawn if <= 0.
        column: name of the column the range check applies to.
        low, high: bounds of the half-open acceptance interval.
        max_rounds: upper bound on calls to ``sample_fn``.

    Returns:
        DataFrame with at most ``n_rows`` accepted rows and a fresh index.
        A warning is issued if fewer than ``n_rows`` could be collected.
    """
    if n_rows <= 0:
        return pd.DataFrame()

    accepted = []
    n_accepted = 0
    for _ in range(max_rounds):
        batch = sample_fn(n_rows - n_accepted)
        in_range = batch[(batch[column] >= low) & (batch[column] < high)]
        accepted.append(in_range)
        n_accepted += len(in_range)
        if n_accepted >= n_rows:
            break

    if not accepted:
        return pd.DataFrame()
    if n_accepted < n_rows:
        warnings.warn(
            f"only {n_accepted} of {n_rows} requested rows fell in "
            f"[{low}, {high}) for '{column}' after {max_rounds} rounds"
        )
    return pd.concat(accepted, ignore_index=True).head(n_rows)


# ---------------------------------------------------------------- model helpers
def fit_ctgan(train_rows, discrete_columns, params, seed=SEED):
    """Create a CTGAN with ``params``, seed it, and fit it on ``train_rows``.

    ``params`` holds CTGAN constructor keyword arguments (embedding_dim,
    generator_dim, discriminator_dim, pac, batch_size, discriminator_steps).
    Seeding is best-effort reproducibility; GPU kernels may still be
    non-deterministic.
    """
    from ctgan import CTGAN

    model = CTGAN(**params)
    model.set_random_state(seed)
    model.fit(train_rows, discrete_columns=discrete_columns)
    return model


def tune_ctgan(class_rows, discrete_columns, space, seed=SEED):
    """Search CTGAN hyperparameters with Optuna and return the best set.

    Each trial fits a CTGAN on ``class_rows``, samples twice as many rows, and
    is scored by the negated ``moment_gap`` of the ``visi`` column (so the
    study maximises similarity of mean and standard deviation).
    """
    import optuna

    def objective(trial):
        params = {
            'embedding_dim': trial.suggest_int('embedding_dim', *space['embedding_dim']),
            'generator_dim': trial.suggest_categorical('generator_dim', space['layer_dims']),
            'discriminator_dim': trial.suggest_categorical('discriminator_dim', space['layer_dims']),
            'pac': trial.suggest_categorical('pac', [4, 8]),
            'batch_size': trial.suggest_categorical('batch_size', space['batch_size']),
            'discriminator_steps': trial.suggest_int(
                'discriminator_steps', 1, space['max_discriminator_steps']
            ),
        }
        model = fit_ctgan(class_rows, discrete_columns, params, seed)
        synthetic = model.sample(len(class_rows) * 2)
        return -moment_gap(class_rows['visi'], synthetic['visi'])

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective, n_trials=space['n_trials'])
    return study.best_params


# ---------------------------------------------------------------- pipeline
def augment_region(region, minority_targets=MINORITY_TARGETS, seed=SEED):
    """Oversample one region's training set with SMOTENC followed by CTGAN.

    Steps: load the CSV, seed the minority classes with SMOTENC, tune and fit
    one CTGAN per minority class, then top each class up to every requested
    target with range-checked synthetic rows. Class 2 rows are taken unchanged
    from the loaded data.

    Returns:
        dict mapping each target to its final DataFrame.
    """
    from imblearn.over_sampling import SMOTENC

    data = pd.read_csv(TRAIN_PATH.format(region=region), index_col=0)
    data = data.astype({col: 'int' for col in INTEGER_CODED_COLUMNS})

    y = data['multi_class']
    X = data.drop(columns=['multi_class', 'binary_class'] + DERIVED_COLUMNS)

    # SMOTENC takes positional indices of the categorical columns.
    categorical_idx = [i for i, dtype in enumerate(X.dtypes) if dtype != 'float64']
    smotenc = SMOTENC(
        categorical_features=categorical_idx,
        sampling_strategy=smote_targets(y),
        random_state=seed,
    )
    X_resampled, y_resampled = smotenc.fit_resample(X, y)

    lerp_data = X_resampled.copy()
    lerp_data['multi_class'] = y_resampled

    # CTGAN takes the categorical columns by name (includes 'multi_class').
    discrete_columns = [col for col, dtype in lerp_data.dtypes.items() if dtype != 'float64']

    synthetic = {target: [] for target in minority_targets}
    for label, (low, high) in VISI_RANGES.items():
        class_rows = lerp_data[lerp_data['multi_class'] == label]
        shortfalls = {target: target - len(class_rows) for target in minority_targets}
        if max(shortfalls.values()) <= 0:
            continue  # class already meets every target; nothing to synthesise

        best_params = tune_ctgan(class_rows, discrete_columns, SEARCH_SPACES[label], seed)
        model = fit_ctgan(class_rows, discrete_columns, best_params, seed)
        for target, n_missing in shortfalls.items():
            if n_missing > 0:
                synthetic[target].append(
                    sample_in_range(model.sample, n_missing, 'visi', low, high)
                )

    minority_seed = lerp_data[lerp_data['multi_class'] != 2]
    majority = data[data['multi_class'] == 2]

    results = {}
    for target in minority_targets:
        minority = restore_derived_columns(
            pd.concat([minority_seed] + synthetic[target], axis=0)
        )
        results[target] = pd.concat([minority, majority], axis=0).reset_index(drop=True)
    return results


def main(regions=REGIONS, minority_targets=MINORITY_TARGETS):
    """Run the pipeline for every region and write one CSV per (target, region)."""
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Optuna warns about tuple-valued categorical choices; they work with the
    # in-memory storage used here.
    warnings.filterwarnings("ignore", category=UserWarning, module="optuna.distributions")

    for region in regions:
        for target, final_data in augment_region(region, minority_targets).items():
            final_data.to_csv(OUTPUT_PATH.format(target=target, region=region), index=False)
            counts = final_data['multi_class'].value_counts()
            print(counts.get(0, 0), '|', counts.get(1, 0), '|', counts.get(2, 0))


# %%
# Targets per minority class: 10,000 and 7,000 rows.
if __name__ == "__main__":
    main()