# Model checkpoints and device placement for the distillation run.
[model]
teacher = "../models/Qwen3.5-35B-A3B"
student = "../models/sn97-text"
# Tokenizer is loaded from the teacher checkpoint and shared with the student.
tokenizer = "../models/Qwen3.5-35B-A3B"
student_device = "cuda:7"
# GPU indices the teacher is sharded across (student sits on cuda:7 above).
teacher_devices = [0, 1, 2, 3, 4, 5]
# Per-device memory cap for the teacher — NOTE(review): confirm whether the
# consumer interprets this as GB or GiB.
teacher_max_memory_gb = 70
|
|
# Streaming dataset settings for training text.
[data]
dataset = "karpathy/climbmix-400b-shuffle"
text_field = "text"
# Samples shorter than this (in characters) are presumably filtered out —
# TODO confirm against the data loader.
min_chars = 768
max_seq_len = 768
# NOTE(review): looks like the token position from which the KL loss starts
# being applied — verify semantics in the trainer.
kl_start_pos = 64
seed = 6767
shuffle_buffer = 10000
|
|
# Optimizer, batching, and precision settings.
[train]
seed = 6767
lr = 1.0e-7
schedule = "cosine"
warmup_steps = 20
weight_decay = 0.0
grad_clip = 1.0
betas = [0.9, 0.999]
# NOTE(review): 1e-3 is unusually large for an Adam-style epsilon (1e-8 is
# the common default) — confirm this is intentional.
eps = 1.0e-3
# Effective batch = samples_per_step; micro_batch_size controls per-forward
# chunking (gradient accumulation) — TODO confirm in trainer.
samples_per_step = 2
micro_batch_size = 1
max_steps = 20
grad_checkpointing = true
attn_implementation = "flash_attention_2"
student_dtype = "bfloat16"
teacher_dtype = "bfloat16"
kl_chunk_size = 128
# Presumably an LR multiplier for newly added/initialized layers — verify.
new_layer_lr_mul = 5.0
# Literal strings avoid double-escaping the regex backslashes; the parsed
# values are identical to the previous basic-string ("\\.") form.
trainable_patterns = ['^model\.layers\.(32|33|34)\.', '^lm_head', '^model\.norm']
|
|
# Periodic evaluation settings.
[eval]
every_steps = 5
samples = 32
# Separate seed so eval sampling is stable across runs regardless of train seed.
seed = 4242
cache_path = "./cache/sn97_eval_32.pt"
|
|
# Logging and output locations. W&B is disabled; project/run names are kept
# so it can be re-enabled without further edits.
[log]
wandb = false
wandb_project = "distil-subnet97"
wandb_run = "sn97_text35_warm"
log_every = 1
output_dir = "./out/sn97_text35_warm"
experiment_log = "./out/experiments.jsonl"
|
|
# Student initialization.
[init]
# Layer indices to zero-initialize; empty means none — TODO confirm semantics.
zero_layers = []
target_num_layers = 35
|
|