Eric2333 and Claude Opus 4.6 (1M context) committed
Commit 01409c9 · 1 parent: c568b76

Fix finetuning RuntimeError and add pyproject.toml


- Fix the block_size mismatch causing a "params, grads, exp_avgs, and
  exp_avg_sqs must have same dtype" RuntimeError when finetuning with a
  block_size different from the checkpoint's; see the sketch below. The
  finetuning config now sets block_size=1024 to match the pretrained
  model_updated.pt.
- Add a warning when the config model parameters differ from the checkpoint,
  to help users diagnose shape-mismatch issues.
- Fix the GradScaler logic: enabled=(dtype=='float32') was inverted;
  corrected to enabled=(dtype=='float16').
- Add pyproject.toml with the complete dependency list (transformers, tqdm,
  and tiktoken were missing) and project metadata.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
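
For context, a minimal sketch of the safe resume pattern that the new warning in train.py recommends ("either match block_size to the checkpoint or do not load the optimizer state"). It assumes the nanoGPT-style flow this script follows, where crop_block_size() re-creates wpe.weight as a smaller nn.Parameter, so the checkpoint's Adam state no longer lines up with the live parameters. ckpt_path and the checkpoint dictionary keys are assumptions, and model, device, block_size and the optimizer hyperparameters are taken from the training script's existing config; this is not the repository's exact code:

import torch

checkpoint = torch.load(ckpt_path, map_location=device)    # ckpt_path: placeholder
ckpt_block_size = checkpoint['model_args']['block_size']   # assumed checkpoint layout

if block_size < ckpt_block_size:
    # cropping shrinks the position-embedding table to the new context length
    model.crop_block_size(block_size)

optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)

if block_size == ckpt_block_size:
    # safe: the saved exp_avg / exp_avg_sq tensors still match the live parameters
    optimizer.load_state_dict(checkpoint['optimizer'])
else:
    # start from a fresh optimizer state; loading the checkpoint's optimizer
    # after crop_block_size() is what surfaces the fused-AdamW RuntimeError
    pass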

Files changed (3)
  1. configs/example_finetuning.py +1 -1
  2. pyproject.toml +16 -0
  3. train.py +15 -1
configs/example_finetuning.py CHANGED
@@ -9,7 +9,7 @@ log_dir = os.path.join(out_dir, 'logs') # logs will be written in to out_dir/log
  # -----------------------------------------------------------------------------
  # model parameters
  meta_vocab_size = 1024
- block_size = 256
+ block_size = 1024
  n_layer=24
  n_head=16
  n_embd=1024 # 350M, medium
pyproject.toml ADDED
@@ -0,0 +1,16 @@
+ [project]
+ readme = "README.md"
+ license = {text = "MIT"}
+ requires-python = ">=3.10"
+
+ dependencies = [
+     "torch>=2.0",
+     "numpy",
+     "pynvml",
+     "transformers",
+     "tqdm",
+     "tiktoken",
+ ]
+
+ [project.urls]
+ Repository = "https://huggingface.co/zhaoyichong/GenerRNA"
train.py CHANGED
@@ -125,6 +125,20 @@ elif init_from == 'resume':
  # the rest of the attributes (e.g. dropout) can stay as desired from command line
  for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
      model_args[k] = checkpoint_model_args[k]
+ # check config vs checkpoint model parameter consistency
+ mismatches = []
+ for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias']:
+     config_val = globals().get(k)
+     ckpt_val = checkpoint_model_args[k]
+     if config_val is not None and config_val != ckpt_val:
+         mismatches.append(f" {k}: config={config_val}, checkpoint={ckpt_val}")
+ if mismatches:
+     print("WARNING: config and checkpoint model parameters differ:")
+     for m in mismatches:
+         print(m)
+     print("The checkpoint values will be used. If you intended to use a different block_size,")
+     print("note that crop_block_size + optimizer state loading will cause a RuntimeError.")
+     print("Either match block_size to the checkpoint or do not load the optimizer state.")
  # create the model
  gptconf = GPTConfig(**model_args)
  model = GPT(gptconf)
@@ -143,7 +157,7 @@ if block_size < model.config.block_size:
  model.to(device)

  # initialize a GradScaler. If enabled=False scaler is a no-op
- scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float32'))
+ scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

  # optimizer
  optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
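
For reference, a minimal sketch of the standard torch.cuda.amp pattern the corrected line participates in (not the repository's exact training loop; model, X, Y, and optimizer are placeholders). The scaler only needs to be enabled for float16: float32 and bfloat16 training do not require loss scaling, and with enabled=False every scaler call is a pass-through. With the old inverted check, float16 runs silently lost loss scaling (risking gradient underflow) while float32 runs carried a scaler they did not need:

import torch

dtype = 'float16'  # the config's dtype string, e.g. 'float32', 'bfloat16', or 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]

scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))  # no-op unless float16

with torch.amp.autocast(device_type='cuda', dtype=ptdtype):
    logits, loss = model(X, Y)        # placeholder forward pass

scaler.scale(loss).backward()         # scales the loss only when enabled
scaler.step(optimizer)                # unscales grads; skips the step on inf/nan
scaler.update()
optimizer.zero_grad(set_to_none=True)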