Strip base-model identity from README and training_args.json (kept only in adapter_config.json where PEFT requires it)
4b2c01e verified

```json
{
  "_comment": "Sanitized excerpt of the training configuration. Local paths, tracking IDs, and base-model identity removed (see adapter_config.json for the base model required by PEFT).",
  "task_type": "causal_lm",
  "torch_dtype": "bfloat16",
  "max_length": 8192,
  "max_new_tokens": 64,
  "tuner": {
    "type": "lora",
    "lora_rank": 32,
    "lora_alpha": 64,
    "lora_dropout": 0.05,
    "lora_bias": "none",
    "target_modules": "all-linear (language model only; vision merger limited to linear_fc1/linear_fc2)",
    "use_dora": false,
    "use_rslora": false,
    "freeze_vit": true,
    "freeze_aligner": false
  },
  "optimizer": {
    "name": "adamw_torch_fused",
    "learning_rate": 1e-4,
    "weight_decay": 0.1,
    "adam_beta1": 0.9,
    "adam_beta2": 0.95,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.05,
    "aligner_lr": 2e-6
  },
  "training": {
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "world_size": 4,
    "global_batch_size": 32,
    "bf16": true,
    "gradient_checkpointing": true,
    "seed": 42,
    "data_seed": 42,
    "deepspeed_zero_stage": 2,
    "total_steps": 294,
    "best_eval_loss": 0.1063,
    "best_step": 294
  },
  "framework": {
    "ms_swift_version": "4.1.2",
    "peft_version": "0.19.1"
  }
}
```
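As a sanity check, the `training` block is internally consistent: per_device_train_batch_size (1) × gradient_accumulation_steps (8) × world_size (4) = 32, matching the stated `global_batch_size`.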
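For reference, the `tuner` block maps onto PEFT's `LoraConfig` roughly as sketched below. This is an illustrative reconstruction, not the exact object ms-swift serializes; in particular, the annotated `target_modules` string above is simplified to PEFT's plain `"all-linear"` shorthand.

```python
from peft import LoraConfig

# Illustrative mapping of the "tuner" block to PEFT's LoraConfig.
# freeze_vit / freeze_aligner are ms-swift options with no direct
# LoraConfig counterpart and are omitted here.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,                          # lora_rank
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",                   # lora_bias
    target_modules="all-linear",   # simplified; see annotated value above
    use_dora=False,
    use_rslora=False,
)
```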
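Removing the base-model identity from the README and training_args.json is safe because PEFT resolves the base checkpoint from `base_model_name_or_path` in adapter_config.json at load time. A minimal loading sketch, assuming a causal-LM base and a placeholder adapter path (a multimodal base would use its own auto class instead of `AutoModelForCausalLM`):

```python
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM

ADAPTER_DIR = "path/to/adapter"  # hypothetical local path or Hub repo id

# PEFT reads the base model identity from adapter_config.json, which is
# why it must remain there even after sanitizing the README and
# training_args.json.
peft_config = PeftConfig.from_pretrained(ADAPTER_DIR)
print(peft_config.base_model_name_or_path)

# Load the base model PEFT points at, then attach the LoRA adapter.
base = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
```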