---
# Pruning configuration: 10% channel-wise Taylor pruning of
# DeepSeek-R1-Distill-Llama-8B (CPU pruning, CUDA eval, no pre/post tests).
- Training Parameters:
    base_model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
    save_ckpt_log_name: DeepSeek-R1-Distill-Llama-8B_s0.10_channel
    pruning_ratio: 0.1            # fraction of channels removed
    pruner_type: taylor
    temperature: 1.0
    top_p: 0.95
    max_seq_len: 2048
    # Exactly one pruning granularity is enabled: channel-wise.
    channel_wise: true            # normalized from Python-style `True`
    block_wise: false
    layer_wise: false
    layer: 12                     # only used when layer_wise is enabled
    block_attention_layer_start: 3
    block_attention_layer_end: 31
    block_mlp_layer_start: 3
    block_mlp_layer_end: 31
    iterative_steps: 1
    grouping_strategy: sum
    global_pruning: false
    taylor: param_first           # Taylor-importance variant
    num_examples: 10              # calibration examples for importance scores
    device: cpu
    test_before_train: false
    eval_device: cuda
    test_after_train: false
    seed: 42
    save_model: true
    torch_version: "2.3"          # quoted: unquoted 2.3 parses as a float