{
  "n_layers": 8,
  "d_model": 512,
  "n_ctx": 59,
  "d_head": 64,
  "model_name": "Othello-GPT-Transformer-Lens",
  "n_heads": 8,
  "d_mlp": 2048,
  "act_fn": "gelu",
  "d_vocab": 61,
  "eps": 1e-05,
  "use_attn_result": false,
  "use_attn_scale": true,
  "attn_scale": 8.0,
  "use_split_qkv_input": false,
  "use_hook_mlp_in": false,
  "use_attn_in": false,
  "use_local_attn": false,
  "ungroup_grouped_query_attention": false,
  "original_architecture": "mingpt",
  "from_checkpoint": false,
  "checkpoint_index": null,
  "checkpoint_label_type": null,
  "checkpoint_value": null,
  "tokenizer_name": null,
  "window_size": null,
  "attn_types": null,
  "init_mode": "gpt2",
  "normalization_type": "LN",
  "device": "cpu",
  "n_devices": 1,
  "attention_dir": "causal",
  "attn_only": false,
  "seed": null,
  "initializer_range": 0.035355339059327376,
  "init_weights": false,
  "scale_attn_by_inverse_layer_idx": false,
  "positional_embedding_type": "standard",
  "final_rms": false,
  "d_vocab_out": 61,
  "parallel_attn_mlp": false,
  "rotary_dim": null,
  "n_params": 25165824,
  "use_hook_tokens": false,
  "gated_mlp": false,
  "default_prepend_bos": true,
  "dtype": "float32",
  "tokenizer_prepends_bos": null,
  "n_key_value_heads": null,
  "post_embedding_ln": false,
  "rotary_base": 10000,
  "trust_remote_code": false,
  "rotary_adjacent_pairs": false,
  "load_in_4bit": false,
  "num_experts": null,
  "experts_per_token": null,
  "relative_attention_max_distance": null,
  "relative_attention_num_buckets": null,
  "decoder_start_token_id": null,
  "tie_word_embeddings": false,
  "use_normalization_before_and_after": false,
  "attn_scores_soft_cap": -1.0,
  "output_logits_soft_cap": -1.0,
  "use_NTK_by_parts_rope": false,
  "NTK_by_parts_low_freq_factor": 1.0,
  "NTK_by_parts_high_freq_factor": 4.0,
  "NTK_by_parts_factor": 8.0
}