| from state import Model |
|
|
| |
# Gemma 3, 270M-parameter dense model (smallest Gemma 3 variant).
# Gemma 3 uses a very large 256k sentencepiece vocabulary and ties the
# input embedding with the output projection.
GEMMA3_270M = Model(
    num_layers=18,
    hidden_dim=640,
    intermediate_size=2048,
    vocab_size=262144,
    weight_tied_embeddings=True,
    # Dense model: expert fields are degenerate (a single implicit "expert").
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
# Gemma 3, 1B-parameter dense model.
GEMMA3_1B = Model(
    num_layers=26,
    hidden_dim=1152,
    intermediate_size=6912,
    vocab_size=262144,
    weight_tied_embeddings=True,
    # Dense model: expert fields are degenerate (a single implicit "expert").
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
# Gemma 3, 4B-parameter dense model.
GEMMA3_4B = Model(
    num_layers=34,
    hidden_dim=2560,
    intermediate_size=10240,
    vocab_size=262144,
    weight_tied_embeddings=True,
    # Dense model: expert fields are degenerate (a single implicit "expert").
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
# Gemma 3, 12B-parameter dense model.
GEMMA3_12B = Model(
    num_layers=48,
    hidden_dim=3840,
    intermediate_size=15360,
    vocab_size=262144,
    weight_tied_embeddings=True,
    # Dense model: expert fields are degenerate (a single implicit "expert").
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
# Gemma 3, 27B-parameter dense model (largest Gemma 3 variant).
GEMMA3_27B = Model(
    num_layers=62,
    hidden_dim=5376,
    intermediate_size=21504,
    vocab_size=262144,
    weight_tied_embeddings=True,
    # Dense model: expert fields are degenerate (a single implicit "expert").
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
| |
|
|
| |
# Llama 4 Scout: 16-expert MoE (109B total / ~17B active parameters).
# total_experts=17 counts the 16 routed experts plus the always-on shared
# expert; active_experts=2 is the top-1 routed expert plus that shared one.
LLAMA4_SCOUT = Model(
    vocab_size=202048,
    num_layers=48,
    hidden_dim=5120,
    intermediate_size=8192,
    # FIX: Llama 4 Scout does not tie the input embedding to the LM head
    # (tie_word_embeddings=False in the released HF text config). The
    # previous True here dropped the output projection from the count.
    weight_tied_embeddings=False,
    active_experts=2,
    total_experts=17,
    is_moe=True,
)
|
|
| |
# Llama 3.2, 1B dense model. The small 3.2 variants (1B/3B) tie the
# input embedding with the output projection.
LLAMA3_1B = Model(
    num_layers=16,
    hidden_dim=2048,
    intermediate_size=8192,
    vocab_size=128256,
    weight_tied_embeddings=True,
    # Dense model: expert fields are degenerate (a single implicit "expert").
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
|
|
| |
# Llama 3.2, 3B dense model (tied embeddings, like the 1B variant).
LLAMA3_3B = Model(
    num_layers=28,
    hidden_dim=3072,
    intermediate_size=8192,
    vocab_size=128256,
    weight_tied_embeddings=True,
    # Dense model: expert fields are degenerate (a single implicit "expert").
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
|
|
| |
# Llama 3 / 3.1, 8B dense model.
LLAMA3_8B = Model(
    vocab_size=128256,
    num_layers=32,
    hidden_dim=4096,
    intermediate_size=14336,
    # FIX: unlike the 1B/3B variants, Llama 3 8B does NOT tie embeddings
    # (tie_word_embeddings=False in the released HF config). True here
    # undercounted parameters by vocab_size * hidden_dim (~0.53B).
    weight_tied_embeddings=False,
    # Dense model: expert fields are degenerate (a single implicit "expert").
    active_experts=1,
    total_experts=1,
    is_moe=False,
)
|
|
| |
# Llama 3 / 3.1, 70B dense model.
LLAMA3_70B = Model(
    vocab_size=128256,
    num_layers=80,
    hidden_dim=8192,
    intermediate_size=28672,
    # FIX: Llama 3 70B does NOT tie embeddings (tie_word_embeddings=False
    # in the released HF config); True here dropped the ~1.05B-parameter
    # output projection from the count.
    weight_tied_embeddings=False,
    # Dense model: expert fields are degenerate (a single implicit "expert").
    active_experts=1,
    total_experts=1,
    is_moe=False,
)
|
|
# Registry of the preset configurations above, keyed by the human-readable
# model name shown to users. Built from an explicit (name, preset) pair
# list; dict() preserves insertion order, so display order matches this list.
DEFAULTS = dict(
    [
        ("Gemma3 270M", GEMMA3_270M),
        ("Gemma3 1B", GEMMA3_1B),
        ("Gemma3 4B", GEMMA3_4B),
        ("Gemma3 12B", GEMMA3_12B),
        ("Gemma3 27B", GEMMA3_27B),
        ("Llama3 1B", LLAMA3_1B),
        ("Llama3 3B", LLAMA3_3B),
        ("Llama3 8B", LLAMA3_8B),
        ("Llama3 70B", LLAMA3_70B),
        ("Llama4 Scout", LLAMA4_SCOUT),
    ]
)
|
|