Strip base-model identity from README and training_args.json (kept only in adapter_config.json where PEFT requires it)
4b2c01e verified

```json
{
  "_comment": "Sanitized excerpt of the training configuration. Local paths, tracking IDs, and base-model identity removed (see adapter_config.json for the base model required by PEFT).",
  "task_type": "causal_lm",
  "torch_dtype": "bfloat16",
  "max_length": 8192,
  "max_new_tokens": 64,
  "tuner": {
    "type": "lora",
    "lora_rank": 32,
    "lora_alpha": 64,
    "lora_dropout": 0.05,
    "lora_bias": "none",
    "target_modules": "all-linear (language model only; vision merger limited to linear_fc1/linear_fc2)",
    "use_dora": false,
    "use_rslora": false,
    "freeze_vit": true,
    "freeze_aligner": false
  },
  "optimizer": {
    "name": "adamw_torch_fused",
    "learning_rate": 1e-4,
    "weight_decay": 0.1,
    "adam_beta1": 0.9,
    "adam_beta2": 0.95,
    "adam_epsilon": 1e-8,
    "max_grad_norm": 1.0,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.05,
    "aligner_lr": 2e-6
  },
  "training": {
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "world_size": 4,
    "global_batch_size": 32,
    "bf16": true,
    "gradient_checkpointing": true,
    "seed": 42,
    "data_seed": 42,
    "deepspeed_zero_stage": 2,
    "total_steps": 294,
    "best_eval_loss": 0.1063,
    "best_step": 294
  },
  "framework": {
    "ms_swift_version": "4.1.2",
    "peft_version": "0.19.1"
  }
}
```
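As a sanity check, the `training` block is internally consistent: per_device_train_batch_size (1) × gradient_accumulation_steps (8) × world_size (4) = 32, matching the stated `global_batch_size`.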
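For reference, the `tuner` block maps onto PEFT's `LoraConfig` roughly as sketched below. This is an illustrative reconstruction, not the exact object ms-swift serializes; in particular, the annotated `target_modules` string above is simplified to PEFT's plain `"all-linear"` shorthand.

```python
from peft import LoraConfig

# Illustrative mapping of the "tuner" block to PEFT's LoraConfig.
# freeze_vit / freeze_aligner are ms-swift options with no direct
# LoraConfig counterpart and are omitted here.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,                          # lora_rank
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",                   # lora_bias
    target_modules="all-linear",   # simplified; see annotated value above
    use_dora=False,
    use_rslora=False,
)
```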
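Removing the base-model identity from the README and training_args.json is safe because PEFT resolves the base checkpoint from `base_model_name_or_path` in adapter_config.json at load time. A minimal loading sketch, assuming a causal-LM base and a placeholder adapter path (a multimodal base would use its own auto class instead of `AutoModelForCausalLM`):

```python
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM

ADAPTER_DIR = "path/to/adapter"  # hypothetical local path or Hub repo id

# PEFT reads the base model identity from adapter_config.json, which is
# why it must remain there even after sanitizing the README and
# training_args.json.
peft_config = PeftConfig.from_pretrained(ADAPTER_DIR)
print(peft_config.base_model_name_or_path)

# Load the base model PEFT points at, then attach the LoRA adapter.
base = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
```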