{
  "model_type": "minitransformer",
  "_name_or_path": "Transformer_500M",
  "architectures": ["MiniTransformer"],
  "dim": 768,
  "num_heads": 24,
  "num_layers": 27,
  "seq_len": 8192,
  "window_size": 8192,
  "vocab_size": 200064,
  "mlp_scale": 4,
  "bias": false,
  "dropout": 0.0,
  "num_epochs": 1,
  "global_bsz": 524288,
  "bsz": 1,
  "warmup_steps": 1907,
  "eval_period": 50,
  "save_period": 500,
  "max_lr": 3.0e-4,
  "min_lr": 3.0e-5,
  "max_norm": 1.0,
  "dilation": 1,
  "fsdp": false,
  "ddp": true,
  "mixed_precision": true,
  "torch_dtype": "bfloat16",
  "cpu_offload": false,
  "sharding_strategy": "full_shard",
  "state_dict_type": "full",
  "auto_wrap_policy": "partial",
  "backward_prefetch": "backward_pre",
  "forward_prefetch": false,
  "sync_module_states": true,
  "use_orig_params": true,
  "device_id": null,
  "precision": {
    "param": "bfloat16",
    "reduce": "bfloat16",
    "buffer": "bfloat16"
  },
  "fsdp_modules": [
    "AttentionLayer"
  ],
  "use_activation_checkpointing": true,
  "softcap": 50.0,
  "theta": 10000.0,
  "torch_compile": true
}
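Because JSON carries no comments, a minimal Python sketch of how the derived quantities fall out of this config may help. It assumes the file is saved as `config.json`, that `global_bsz` counts tokens per optimizer step, that `bsz` counts sequences per device per micro-batch, and a hypothetical `world_size` of 8 GPUs; none of these assumptions are stated in the config itself.

```python
import json

# Load the config shown above (filename is an assumption).
with open("config.json") as f:
    cfg = json.load(f)

# Per-head dimension implied by the architecture fields: 768 / 24 = 32.
head_dim = cfg["dim"] // cfg["num_heads"]

# Gradient-accumulation steps, assuming global_bsz is a token budget
# per optimizer step and bsz is sequences per device.
world_size = 8  # hypothetical GPU count, not part of the config
tokens_per_micro_step = cfg["bsz"] * cfg["seq_len"] * world_size
grad_accum_steps = cfg["global_bsz"] // tokens_per_micro_step

print(f"head_dim         = {head_dim}")          # 32
print(f"grad_accum_steps = {grad_accum_steps}")  # 8 under these assumptions
```

Under these assumptions each optimizer step covers 524288 tokens: 1 sequence x 8192 tokens x 8 devices per micro-batch, accumulated over 8 micro-steps.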