Fix finetuning RuntimeError and add pyproject.toml
Browse files

- Fix block_size mismatch causing "params, grads, exp_avgs, and
exp_avg_sqs must have same dtype" RuntimeError when finetuning with
block_size different from checkpoint. Set finetuning config
block_size=1024 to match pretrained model_updated.pt.
- Add warning when config model parameters differ from checkpoint to
help users diagnose shape mismatch issues.
- Fix GradScaler logic: enabled=(dtype=='float32') was inverted,
corrected to enabled=(dtype=='float16').
- Add pyproject.toml with complete dependency list (transformers, tqdm,
tiktoken were missing) and project metadata.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- configs/example_finetuning.py +1 -1
- pyproject.toml +16 -0
- train.py +15 -1
configs/example_finetuning.py
CHANGED
|
@@ -9,7 +9,7 @@ log_dir = os.path.join(out_dir, 'logs') # logs will be written in to out_dir/log
|
|
| 9 |
# -----------------------------------------------------------------------------
|
| 10 |
# model parameters
|
| 11 |
meta_vocab_size = 1024
|
| 12 |
-
block_size =
|
| 13 |
n_layer=24
|
| 14 |
n_head=16
|
| 15 |
n_embd=1024 # 350M, medium
|
|
|
|
| 9 |
# -----------------------------------------------------------------------------
|
| 10 |
# model parameters
|
| 11 |
meta_vocab_size = 1024
|
| 12 |
+
block_size = 1024
|
| 13 |
n_layer=24
|
| 14 |
n_head=16
|
| 15 |
n_embd=1024 # 350M, medium
|
pyproject.toml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
readme = "README.md"
|
| 3 |
+
license = {text = "MIT"}
|
| 4 |
+
requires-python = ">=3.10"
|
| 5 |
+
|
| 6 |
+
dependencies = [
|
| 7 |
+
"torch>=2.0",
|
| 8 |
+
"numpy",
|
| 9 |
+
"pynvml",
|
| 10 |
+
"transformers",
|
| 11 |
+
"tqdm",
|
| 12 |
+
"tiktoken",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
[project.urls]
|
| 16 |
+
Repository = "https://huggingface.co/zhaoyichong/GenerRNA"
|
train.py
CHANGED
|
@@ -125,6 +125,20 @@ elif init_from == 'resume':
|
|
| 125 |
# the rest of the attributes (e.g. dropout) can stay as desired from command line
|
| 126 |
for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
|
| 127 |
model_args[k] = checkpoint_model_args[k]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
# create the model
|
| 129 |
gptconf = GPTConfig(**model_args)
|
| 130 |
model = GPT(gptconf)
|
|
@@ -143,7 +157,7 @@ if block_size < model.config.block_size:
|
|
| 143 |
model.to(device)
|
| 144 |
|
| 145 |
# initialize a GradScaler. If enabled=False scaler is a no-op
|
| 146 |
-
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float32'))
|
| 147 |
|
| 148 |
# optimizer
|
| 149 |
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
|
|
|
|
| 125 |
# the rest of the attributes (e.g. dropout) can stay as desired from command line
|
| 126 |
for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
|
| 127 |
model_args[k] = checkpoint_model_args[k]
|
| 128 |
+
# check config vs checkpoint model parameter consistency
|
| 129 |
+
mismatches = []
|
| 130 |
+
for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias']:
|
| 131 |
+
config_val = globals().get(k)
|
| 132 |
+
ckpt_val = checkpoint_model_args[k]
|
| 133 |
+
if config_val is not None and config_val != ckpt_val:
|
| 134 |
+
mismatches.append(f" {k}: config={config_val}, checkpoint={ckpt_val}")
|
| 135 |
+
if mismatches:
|
| 136 |
+
print("WARNING: config and checkpoint model parameters differ:")
|
| 137 |
+
for m in mismatches:
|
| 138 |
+
print(m)
|
| 139 |
+
print("The checkpoint values will be used. If you intended to use a different block_size,")
|
| 140 |
+
print("note that crop_block_size + optimizer state loading will cause a RuntimeError.")
|
| 141 |
+
print("Either match block_size to the checkpoint or do not load the optimizer state.")
|
| 142 |
# create the model
|
| 143 |
gptconf = GPTConfig(**model_args)
|
| 144 |
model = GPT(gptconf)
|
|
|
|
| 157 |
model.to(device)
|
| 158 |
|
| 159 |
# initialize a GradScaler. If enabled=False scaler is a no-op
|
| 160 |
+
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
|
| 161 |
|
| 162 |
# optimizer
|
| 163 |
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
|