JamieYuu committed
Commit 927cbc1 · verified · 1 Parent(s): 0bb4a75

Remove script-centric files from results
results/requirements.txt DELETED
@@ -1,11 +0,0 @@
- torch>=2.2.0
- transformers>=4.40.0
- datasets>=2.18.0
- pandas>=2.2.0
- numpy>=1.26.0
- scikit-learn>=1.4.0
- pyarrow>=15.0.0
- fastparquet>=2024.2.0
- joblib>=1.3.0
- xgboost>=2.0.0
- peft>=0.11.0
 
results/run_end_to_end_bert.md DELETED
@@ -1,255 +0,0 @@
- # Run Commands: End-to-End BERT Training
-
- ## 1) Go to project folder
- ```bash
- cd /Users/jamie/Downloads/LLM_Project
- ```
-
- ## 2) Activate virtual environment
- ```bash
- source .venv/bin/activate
- ```
-
- ## 3) Train end-to-end BERT (full run)
- ```bash
- python bert.py \
- --model_name boltuix/bert-lite \
- --data_path enriched_news.parquet \
- --output_dir outputs_compare_models \
- --epochs 3 \
- --train_batch_size 16 \
- --max_length 128 \
- --learning_rate 2e-5 \
- --weight_decay 0.01 \
- --sample_frac 1.0 \
- --min_per_stratum 1 \
- --log_level INFO
- ```
-
- ## 4) Train end-to-end BERT (faster dev run on laptop)
- ```bash
- python bert.py \
- --model_name boltuix/bert-lite \
- --data_path enriched_news.parquet \
- --output_dir outputs_compare_models \
- --fast_mode \
- --sample_frac 0.35 \
- --min_per_stratum 50 \
- --log_level INFO
- ```
-
- ## 5) Train with LoRA (faster and lighter than full fine-tuning)
- ```bash
- python bert.py \
- --model_name kk08/CryptoBERT \
- --data_path enriched_news.parquet \
- --output_dir outputs_compare_models \
- --peft_mode lora \
- --lora_r 8 \
- --lora_alpha 16 \
- --lora_dropout 0.05 \
- --lora_target_modules query,key,value \
- --train_batch_size 8 \
- --grad_accum_steps 2 \
- --num_workers 2 \
- --eval_every_epochs 1 \
- --sample_frac 0.5 \
- --min_per_stratum 50 \
- --log_level INFO
- ```
-
- ## 6) Extra speed-focused run (LoRA + fast mode)
- ```bash
- python bert.py \
- --model_name boltuix/bert-lite \
- --data_path enriched_news.parquet \
- --output_dir outputs_compare_models \
- --peft_mode lora \
- --fast_mode \
- --grad_accum_steps 2 \
- --num_workers 2 \
- --sample_frac 0.35 \
- --min_per_stratum 50 \
- --log_level INFO
- ```
-
- ## 7) Output files to check
- `outputs_compare_models/best_neural_model.pt`
- `outputs_compare_models/metrics_summary_neural.json`
- `outputs_compare_models/numeric_scaler.joblib`
- `outputs_compare_models/fng_onehot_encoder.joblib`
-
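- A quick way to sanity-check these artifacts (a minimal sketch; it assumes the paths above and that `joblib` from the requirements is installed):
- ```bash
- # List the checkpoint and pretty-print the metrics summary.
- ls -lh outputs_compare_models/best_neural_model.pt
- python -m json.tool outputs_compare_models/metrics_summary_neural.json
- # Load the preprocessing artifacts to confirm they deserialize.
- python - <<'EOF'
- import joblib
- scaler = joblib.load("outputs_compare_models/numeric_scaler.joblib")
- encoder = joblib.load("outputs_compare_models/fng_onehot_encoder.joblib")
- print(type(scaler).__name__, type(encoder).__name__)
- EOF
- ```
-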
- ## 8) Optional: freeze BERT (not end-to-end fine-tuning)
- ```bash
- python bert.py --freeze_bert
- ```
-
- ## 9) Train XGBoost only (frozen CLS + numeric)
- ```bash
- python xgb.py \
- --model_name gaunernst/bert-mini-uncased \
- --data_path enriched_news.parquet \
- --output_json outputs_compare_models/metrics_xgb.json \
- --sample_frac 1.0 \
- --min_per_stratum 1 \
- --xgb_random_search \
- --xgb_random_iters 20 \
- --xgb_refit_on_train_val \
- --log_level INFO
- ```
-
- ## 10) Train XGBoost only (faster run)
- ```bash
- python xgb.py \
- --model_name gaunernst/bert-mini-uncased \
- --data_path enriched_news.parquet \
- --sample_frac 0.35 \
- --min_per_stratum 50 \
- --embedding_batch_size 32 \
- --max_length 96 \
- --xgb_n_estimators 200 \
- --xgb_max_depth 4 \
- --xgb_random_search \
- --xgb_random_iters 10 \
- --xgb_refit_on_train_val \
- --log_level INFO
- ```
-
- ## 11) Compare numeric-only XGB vs frozen FinBERT CLS+numeric XGB
- ```bash
- python compare_xgb_text_vs_numeric.py \
- --embedding_source frozen \
- --model_name ProsusAI/finbert \
- --data_path enriched_news.parquet \
- --sample_frac 1.0 \
- --min_per_stratum 1 \
- --xgb_random_search \
- --xgb_random_iters 20 \
- --xgb_refit_on_train_val \
- --log_level INFO
- ```
-
- ## 12) Compare numeric-only XGB vs fine-tuned CLS+numeric XGB
- ```bash
- python compare_xgb_text_vs_numeric.py \
- --embedding_source finetuned \
- --model_name boltuix/bert-lite \
- --finetuned_checkpoint outputs_compare_models/best_neural_model_base_boltuix_bert-lite.pt \
- --tokenizer_path outputs_compare_models \
- --data_path enriched_news.parquet \
- --sample_frac 1.0 \
- --min_per_stratum 1 \
- --xgb_random_search \
- --xgb_random_iters 20 \
- --xgb_refit_on_train_val \
- --log_level INFO
- ```
-
- ## 13) Compare script (faster run)
- ```bash
- python compare_xgb_text_vs_numeric.py \
- --embedding_source frozen \
- --model_name ProsusAI/finbert \
- --sample_frac 0.35 \
- --min_per_stratum 50 \
- --embedding_batch_size 32 \
- --max_length 96 \
- --xgb_random_search \
- --xgb_random_iters 10 \
- --xgb_refit_on_train_val \
- --log_level INFO
- ```
-
- ## 14) Typical workflow (recommended order)
- ```bash
- # Step A: train neural model (full or LoRA)
- python bert.py --peft_mode lora --fast_mode --sample_frac 0.35 --min_per_stratum 50 --log_level INFO
-
- # Step B: compare numeric-only vs fine-tuned CLS+numeric
- python compare_xgb_text_vs_numeric.py --finetuned_checkpoint outputs_compare_models/best_neural_model.pt --tokenizer_path outputs_compare_models --sample_frac 0.35 --min_per_stratum 50 --log_level INFO
-
- # Optional Step C: run frozen-CLS XGB baseline script directly
- python xgb.py --sample_frac 0.35 --min_per_stratum 50 --xgb_random_search --xgb_refit_on_train_val --log_level INFO
- ```
-
- ## 15) Useful output files from other scripts
- `outputs_compare_models/metrics_xgb.json` (from `xgb.py`)
- `outputs_compare_models/metrics_xgb_cls_vs_numeric.json` (from `compare_xgb_text_vs_numeric.py`)
-
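- A schema-agnostic way to peek at both files (a small sketch, assuming they are standard JSON):
- ```bash
- # Print the top-level keys of each metrics file.
- for f in outputs_compare_models/metrics_xgb.json \
-          outputs_compare_models/metrics_xgb_cls_vs_numeric.json; do
-   echo "== $f"
-   python -c "import json, sys; print(sorted(json.load(open(sys.argv[1]))))" "$f"
- done
- ```
-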
- ## 16) Embedding cache safety (important)
- Caches now use an auto key derived from the model + sample settings + seed + max length (see the sketch at the end of this section).
- This makes cache reuse safer when you change `sample_frac`, the model, or other settings.
- Optional manual override (compare script):
- ```bash
- python compare_xgb_text_vs_numeric.py \
- --embedding_source frozen \
- --model_name ProsusAI/finbert \
- --embedding_cache_key my_custom_cache_v1
- ```
- If you want strict isolation per experiment, use a separate cache folder too:
- ```bash
- python compare_xgb_text_vs_numeric.py \
- --embedding_source frozen \
- --model_name ProsusAI/finbert \
- --embedding_cache_dir embedding_cache_xgb_compare_exp1
- ```
-
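- How such an auto key can be derived (an illustrative sketch only, not the script's actual implementation; the settings string is made up):
- ```bash
- # Hash the settings that affect the embeddings into a short, stable cache key.
- settings="ProsusAI/finbert|sample_frac=1.0|min_per_stratum=1|seed=42|max_length=128"
- key=$(printf '%s' "$settings" | shasum -a 256 | cut -c1-16)
- echo "embedding cache key: $key"
- ```
- Any change to the model, sampling, seed, or max length then produces a different key, so a changed setting cannot silently reuse stale embeddings.
-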
- ## After getting correct embeddings (this takes about an hour to run)
- ```bash
- python compare_xgb_text_vs_numeric.py \
- --embedding_source frozen \
- --model_name ProsusAI/finbert \
- --data_path enriched_news.parquet \
- --sample_frac 1.0 \
- --min_per_stratum 1 \
- --embedding_cache_dir embedding_cache_xgb_compare_fresh_20260316 \
- --embedding_cache_key finbert_full_fresh \
- --xgb_random_iters 1 \
- --xgb_n_estimators 100 \
- --xgb_max_depth 3 \
- --log_level INFO
- ```
-
- Copy-safe single line:
- ```bash
- python compare_xgb_text_vs_numeric.py --embedding_source frozen --model_name ProsusAI/finbert --data_path enriched_news.parquet --sample_frac 1.0 --min_per_stratum 1 --embedding_cache_dir embedding_cache_xgb_compare_fresh_20260316 --embedding_cache_key finbert_full_fresh --xgb_random_iters 1 --xgb_n_estimators 100 --xgb_max_depth 3 --log_level INFO
- ```
-
- ## Fast version of frozen CLS (bert-lite), with similar performance to FinBERT
- ```bash
- python compare_xgb_text_vs_numeric.py \
- --embedding_source frozen \
- --model_name boltuix/bert-lite \
- --data_path enriched_news.parquet \
- --sample_frac 1.0 \
- --min_per_stratum 1 \
- --embedding_cache_dir embedding_cache_xgb_compare_fresh_20260316 \
- --embedding_cache_key bertlite_full_fresh \
- --log_level INFO
- ```
-
- ## 17) Reproducible run settings (recommended)
- Use a fixed `--seed` and a fixed cache key.
- Use `--deterministic_run` (forces single-thread XGBoost/CV workers).
- Keep data/sampling settings unchanged across reruns.
-
- ```bash
- python compare_xgb_text_vs_numeric.py \
- --embedding_source frozen \
- --model_name boltuix/bert-lite \
- --data_path enriched_news.parquet \
- --sample_frac 1.0 \
- --min_per_stratum 1 \
- --seed 42 \
- --embedding_cache_dir embedding_cache_xgb_compare_fresh_20260316 \
- --embedding_cache_key bertlite_full_fresh \
- --deterministic_run \
- --xgb_random_iters 20 \
- --xgb_refit_on_train_val \
- --log_level INFO
- ```
-
- Optional: if you do not use `--deterministic_run`, set workers manually:
- ```bash
- python compare_xgb_text_vs_numeric.py --xgb_n_jobs 1
- ```