Charlie81 committed on
Commit
582cd6b
·
1 Parent(s): e3a54b7

update training script

Browse files
Files changed (1) hide show
  1. scripts/train.py +40 -40
scripts/train.py CHANGED
@@ -5,6 +5,7 @@ from datasets import load_dataset
5
  from myolmoe.modeling_myolmoe import MyOlmoeForCausalLM, OlmoeConfig
6
  from torch.utils.data import Dataset
7
  import os
 
8
 
9
  class CustomDataset(Dataset):
10
  def __init__(self, tokenizer, dataset_name="allenai/tulu-v2-sft-mixture", max_length=512):
@@ -40,67 +41,67 @@ def expand_model_with_small_experts(base_model):
40
  # Create new model with expanded architecture
41
  expanded_model = MyOlmoeForCausalLM(config)
42
 
43
- # 1. Copy all non-expert weights exactly
44
- base_state_dict = base_model.state_dict()
45
- expanded_state_dict = expanded_model.state_dict()
46
 
47
  # Copy all non-expert parameters
48
- for name, param in base_state_dict.items():
49
- if "experts" not in name: # Skip expert-specific parameters
50
- expanded_state_dict[name].copy_(param)
51
-
52
- # 2. Copy the original experts' weights
53
- for i in range(config.num_experts):
54
- # Copy gate_proj weights
55
- expanded_state_dict[f'model.layers.{i}.mlp.experts.{i}.gate_proj.weight'].copy_(
56
- base_state_dict[f'model.layers.{i}.mlp.experts.{i}.gate_proj.weight'][:config.small_expert_intermediate_size]
57
- )
58
- # Copy up_proj weights
59
- expanded_state_dict[f'model.layers.{i}.mlp.experts.{i}.up_proj.weight'].copy_(
60
- base_state_dict[f'model.layers.{i}.mlp.experts.{i}.up_proj.weight'][:config.small_expert_intermediate_size]
61
- )
62
- # Copy down_proj weights (need to handle output dimension differently)
63
- expanded_state_dict[f'model.layers.{i}.mlp.experts.{i}.down_proj.weight'].copy_(
64
- base_state_dict[f'model.layers.{i}.mlp.experts.{i}.down_proj.weight'][:,:config.small_expert_intermediate_size]
65
- )
66
-
67
- # 3. Initialize the gate layer for all experts (original + small)
68
- # The original gate had shape (hidden_size, num_experts)
69
- # New gate needs shape (hidden_size, num_experts + num_small_experts)
70
- for i in range(config.num_hidden_layers):
71
- original_gate = base_state_dict[f'model.layers.{i}.mlp.gate.weight']
72
- new_gate = expanded_state_dict[f'model.layers.{i}.mlp.gate.weight']
73
 
74
  # Copy original gate weights
75
- new_gate[:, :config.num_experts].copy_(original_gate)
76
 
77
- # Initialize small experts gate weights (could use different initialization)
78
  torch.nn.init.normal_(
79
- new_gate[:, config.num_experts:],
80
  mean=0.0,
81
  std=config.initializer_range
82
  )
83
 
84
- # Load the combined state dict into the new model
85
- expanded_model.load_state_dict(expanded_state_dict)
86
 
87
  return expanded_model
88
 
89
  def main():
90
  # Load base model (with only 64 experts)
91
  model_path = "myolmoe"
 
92
  base_model = MyOlmoeForCausalLM.from_pretrained(model_path)
93
 
94
  # Verify base model has only 64 experts
95
  print(f"Base model has {base_model.config.num_experts} experts")
96
 
97
  # Expand model to include small experts
 
98
  model = expand_model_with_small_experts(base_model)
99
 
100
  # Verify expanded model
101
  print(f"Expanded model has {model.config.num_experts} regular experts and {model.config.num_small_experts} small experts")
102
 
103
  # Prepare dataset
 
104
  tokenizer = AutoTokenizer.from_pretrained(model_path)
105
  dataset = CustomDataset(tokenizer)
106
 
@@ -118,11 +119,7 @@ def main():
118
  eval_steps=500,
119
  fp16=True,
120
  gradient_checkpointing=True,
121
- report_to="tensorboard",
122
- # Important: Only train the new parameters initially
123
- # Freeze original experts first, then unfreeze later
124
- # You may want to modify this based on your training strategy
125
- freeze_existing_experts=True
126
  )
127
 
128
  # Custom trainer to handle expert freezing
@@ -134,7 +131,7 @@ def main():
134
  if self.freeze_existing:
135
  # Freeze all original expert parameters
136
  for name, param in self.model.named_parameters():
137
- if "experts" in name and "small_experts" not in name:
138
  param.requires_grad = False
139
  print("Frozen original experts, only training small experts")
140
 
@@ -143,17 +140,20 @@ def main():
143
  args=training_args,
144
  train_dataset=dataset,
145
  eval_dataset=dataset,
146
- freeze_existing_experts=training_args.freeze_existing_experts
147
  )
148
 
149
  # Train
 
150
  trainer.train()
151
 
152
  # Save final model
153
  output_dir = "./final_model"
154
  os.makedirs(output_dir, exist_ok=True)
 
155
  model.save_pretrained(output_dir)
156
  tokenizer.save_pretrained(output_dir)
 
157
 
158
  if __name__ == "__main__":
159
  main()
 
5
  from myolmoe.modeling_myolmoe import MyOlmoeForCausalLM, OlmoeConfig
6
  from torch.utils.data import Dataset
7
  import os
8
+ from tqdm import tqdm
9
 
10
  class CustomDataset(Dataset):
11
  def __init__(self, tokenizer, dataset_name="allenai/tulu-v2-sft-mixture", max_length=512):
 
41
  # Create new model with expanded architecture
42
  expanded_model = MyOlmoeForCausalLM(config)
43
 
44
+ # 1. First load all compatible weights (excluding expert layers)
45
+ base_model_state_dict = base_model.state_dict()
46
+ expanded_model_state_dict = expanded_model.state_dict()
47
 
48
  # Copy all non-expert parameters
49
+ for name, param in tqdm(base_model_state_dict.items(), desc="Copying base weights"):
50
+ if "mlp.experts" not in name and "mlp.gate" not in name:
51
+ if name in expanded_model_state_dict:
52
+ expanded_model_state_dict[name].copy_(param)
53
+
54
+ # 2. Handle expert layers
55
+ for layer_idx in tqdm(range(config.num_hidden_layers), desc="Processing expert layers"):
56
+ # Copy original expert weights
57
+ for expert_idx in range(config.num_experts):
58
+ # Get the original expert weights
59
+ gate_proj_weight = base_model_state_dict[f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.gate_proj.weight']
60
+ up_proj_weight = base_model_state_dict[f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.up_proj.weight']
61
+ down_proj_weight = base_model_state_dict[f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj.weight']
62
+
63
+ # Copy to new model (slicing for small experts not needed here since we're copying original experts)
64
+ expanded_model_state_dict[f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.gate_proj.weight'].copy_(gate_proj_weight)
65
+ expanded_model_state_dict[f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.up_proj.weight'].copy_(up_proj_weight)
66
+ expanded_model_state_dict[f'model.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj.weight'].copy_(down_proj_weight)
67
+
68
+ # Handle gate layer expansion
69
+ original_gate_weight = base_model_state_dict[f'model.layers.{layer_idx}.mlp.gate.weight']
70
+ new_gate_weight = expanded_model_state_dict[f'model.layers.{layer_idx}.mlp.gate.weight']
 
 
 
71
 
72
  # Copy original gate weights
73
+ new_gate_weight[:, :config.num_experts].copy_(original_gate_weight)
74
 
75
+ # Initialize small experts gate weights
76
  torch.nn.init.normal_(
77
+ new_gate_weight[:, config.num_experts:],
78
  mean=0.0,
79
  std=config.initializer_range
80
  )
81
 
82
+ # 3. Load the combined state dict into the new model
83
+ expanded_model.load_state_dict(expanded_model_state_dict, strict=False)
84
 
85
  return expanded_model
86
 
87
  def main():
88
  # Load base model (with only 64 experts)
89
  model_path = "myolmoe"
90
+ print("Loading base model...")
91
  base_model = MyOlmoeForCausalLM.from_pretrained(model_path)
92
 
93
  # Verify base model has only 64 experts
94
  print(f"Base model has {base_model.config.num_experts} experts")
95
 
96
  # Expand model to include small experts
97
+ print("Expanding model with small experts...")
98
  model = expand_model_with_small_experts(base_model)
99
 
100
  # Verify expanded model
101
  print(f"Expanded model has {model.config.num_experts} regular experts and {model.config.num_small_experts} small experts")
102
 
103
  # Prepare dataset
104
+ print("Preparing dataset...")
105
  tokenizer = AutoTokenizer.from_pretrained(model_path)
106
  dataset = CustomDataset(tokenizer)
107
 
 
119
  eval_steps=500,
120
  fp16=True,
121
  gradient_checkpointing=True,
122
+ report_to="tensorboard"
 
 
 
 
123
  )
124
 
125
  # Custom trainer to handle expert freezing
 
131
  if self.freeze_existing:
132
  # Freeze all original expert parameters
133
  for name, param in self.model.named_parameters():
134
+ if "mlp.experts" in name and "small_experts" not in name:
135
  param.requires_grad = False
136
  print("Frozen original experts, only training small experts")
137
 
 
140
  args=training_args,
141
  train_dataset=dataset,
142
  eval_dataset=dataset,
143
+ freeze_existing_experts=True
144
  )
145
 
146
  # Train
147
+ print("Starting training...")
148
  trainer.train()
149
 
150
  # Save final model
151
  output_dir = "./final_model"
152
  os.makedirs(output_dir, exist_ok=True)
153
+ print(f"Saving final model to {output_dir}")
154
  model.save_pretrained(output_dir)
155
  tokenizer.save_pretrained(output_dir)
156
+ print("Training complete!")
157
 
158
  if __name__ == "__main__":
159
  main()