---
# Pruning configuration: 10% channel-wise Taylor pruning of
# DeepSeek-R1-Distill-Llama-8B (CPU pruning, CUDA eval, no pre/post tests).
- Training Parameters:
    base_model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
    save_ckpt_log_name: DeepSeek-R1-Distill-Llama-8B_s0.10_channel
    pruning_ratio: 0.1            # fraction of channels removed
    pruner_type: taylor
    temperature: 1.0
    top_p: 0.95
    max_seq_len: 2048
    # Exactly one pruning granularity is enabled: channel-wise.
    channel_wise: true            # normalized from Python-style `True`
    block_wise: false
    layer_wise: false
    layer: 12                     # only used when layer_wise is enabled
    block_attention_layer_start: 3
    block_attention_layer_end: 31
    block_mlp_layer_start: 3
    block_mlp_layer_end: 31
    iterative_steps: 1
    grouping_strategy: sum
    global_pruning: false
    taylor: param_first           # Taylor-importance variant
    num_examples: 10              # calibration examples for importance scores
    device: cpu
    test_before_train: false
    eval_device: cuda
    test_after_train: false
    seed: 42
    save_model: true
    torch_version: "2.3"          # quoted: unquoted 2.3 parses as a float