- Training Parameters:
  - base_model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  - save_ckpt_log_name: DeepSeek-R1-Distill-Llama-8B_s0.10_channel
  - pruning_ratio: 0.1
  - pruner_type: taylor
  - temperature: 1.0
  - top_p: 0.95
  - max_seq_len: 2048
  - channel_wise: True
  - block_wise: False
  - layer_wise: False
  - layer: 12
  - block_attention_layer_start: 3
  - block_attention_layer_end: 31
  - block_mlp_layer_start: 3
  - block_mlp_layer_end: 31
  - iterative_steps: 1
  - grouping_strategy: sum
  - global_pruning: False
  - taylor: param_first
  - num_examples: 10
  - device: cpu
  - test_before_train: False
  - eval_device: cuda
  - test_after_train: False
  - seed: 42
  - save_model: True
  - torch_version: 2.3