| { |
| "base_model_class": "Gemma4TextCausalLMProxy", |
| "code_files": [ |
| "/tmp/gemma4-hf-export-zxa_ushe/configuration_gemma4.py", |
| "/tmp/gemma4-hf-export-zxa_ushe/modeling_gemma4.py", |
| "/tmp/gemma4-hf-export-zxa_ushe/gemma4_optimization.py", |
| "/tmp/gemma4-hf-export-zxa_ushe/__init__.py" |
| ], |
| "max_shard_size": "5GB", |
| "model_class": "OptimizedGemma4ForCausalLM", |
| "output_dir": "/tmp/gemma4-hf-export-zxa_ushe", |
| "repo_id": "haysonC/gemma4-zero-compute", |
| "router_checkpoint": { |
| "config": { |
| "metadata": { |
| "resume_summary": { |
| "config": { |
| "metadata": { |
| "resume_summary": { |
| "config": { |
| "metadata": { |
| "resume_summary": { |
| "loaded": false, |
| "reason": "resume disabled" |
| }, |
| "step": 500, |
| "training_metrics": { |
| "current_lambda_zero_compute": 2.0, |
| "effective_batch_size": 16, |
| "entropy_loss": 4.538632531960806, |
| "entropy_term": 0.0, |
| "expert_usage_sample": [ |
| 1240.0, |
| 2139.0, |
| 1213.0, |
| 793.0, |
| 1992.0, |
| 1957.0, |
| 503.0, |
| 1528.0, |
| 1398.0, |
| 2252.0, |
| 1835.0, |
| 1146.0, |
| 1440.0, |
| 1527.0, |
| 1757.0, |
| 803.0 |
| ], |
| "grad_norm": 70.5, |
| "gradient_accumulation_steps": 8, |
| "lambda_entropy": 0.0, |
| "lambda_router": 1.0, |
| "lambda_zero_compute": 2.0, |
| "loss": 2.327640622854233, |
| "micro_batch_size": 2, |
| "output_kl": 1.1904816403985023, |
| "output_kl_term": 1.1904816403985023, |
| "probe_output_kl": 1.1182771921157837, |
| "probe_router_entropy": 4.540449047088623, |
| "probe_router_kl": 0.051853783428668976, |
| "probe_same_expert_ratio": 0.8435872395833334, |
| "probe_zero_compute_loss": 0.5133160352706909, |
| "probe_zero_compute_margin_gap": 0.41453187317432216, |
| "probe_zero_compute_mass": 0.013335910812020301, |
| "probe_zero_compute_top1_ratio": 0.00654296875, |
| "probe_zero_compute_topk_ratio": 0.29026692708333335, |
| "router_entropy": 4.538632531960806, |
| "router_kl": 0.05338437343016267, |
| "router_kl_term": 0.05338437343016267, |
| "same_expert_ratio": 0.8424682617187501, |
| "step": 500, |
| "tokens_per_optimizer_step": 8192, |
| "zero_compute_loss": 0.5418873056769371, |
| "zero_compute_margin_gap": 0.4672557485134652, |
| "zero_compute_mass": 0.012517090452214084, |
| "zero_compute_ramp_steps": 50, |
| "zero_compute_term": 1.0837746113538742, |
| "zero_compute_top1_ratio": 0.00439453125, |
| "zero_compute_topk_margin": 0.0, |
| "zero_compute_topk_ratio": 0.24339599609374998, |
| "zero_compute_warmup_steps": 0, |
| "zero_expert_usage": 1080.0 |
| } |
| }, |
| "model_config": { |
| "add_zero_compute_expert": true, |
| "num_experts": 128, |
| "top_k_experts": 8, |
| "use_zero_compute_optimization": true |
| }, |
| "num_router_keys": 90, |
| "router_keys_sample": [ |
| "model.layers.0.router.per_expert_scale", |
| "model.layers.0.router.proj.weight", |
| "model.layers.0.router.scale", |
| "model.layers.1.router.per_expert_scale", |
| "model.layers.1.router.proj.weight", |
| "model.layers.1.router.scale", |
| "model.layers.10.router.per_expert_scale", |
| "model.layers.10.router.proj.weight", |
| "model.layers.10.router.scale", |
| "model.layers.11.router.per_expert_scale", |
| "model.layers.11.router.proj.weight", |
| "model.layers.11.router.scale" |
| ], |
| "source_model_id": "" |
| }, |
| "config_path": "/cache/router_artifacts/router_config.json", |
| "loaded": true, |
| "loaded_key_count": 90, |
| "loaded_keys_sample": [ |
| "model.layers.0.router.per_expert_scale", |
| "model.layers.0.router.proj.weight", |
| "model.layers.0.router.scale", |
| "model.layers.1.router.per_expert_scale", |
| "model.layers.1.router.proj.weight", |
| "model.layers.1.router.scale", |
| "model.layers.10.router.per_expert_scale", |
| "model.layers.10.router.proj.weight", |
| "model.layers.10.router.scale", |
| "model.layers.11.router.per_expert_scale", |
| "model.layers.11.router.proj.weight", |
| "model.layers.11.router.scale" |
| ], |
| "path": "/cache/router_artifacts/router_state_dict.pt" |
| }, |
| "step": 100, |
| "training_metrics": { |
| "current_lambda_zero_compute": 3.0, |
| "effective_batch_size": 16, |
| "entropy_loss": 4.542559911807379, |
| "entropy_term": 0.0, |
| "expert_usage_sample": [ |
| 1239.0, |
| 2070.0, |
| 1500.0, |
| 1201.0, |
| 2009.0, |
| 1821.0, |
| 670.0, |
| 1778.0, |
| 1452.0, |
| 2154.0, |
| 2007.0, |
| 980.0, |
| 1320.0, |
| 1568.0, |
| 1522.0, |
| 700.0 |
| ], |
| "grad_norm": 446.0, |
| "gradient_accumulation_steps": 8, |
| "lambda_entropy": 0.0, |
| "lambda_router": 1.0, |
| "lambda_zero_compute": 3.0, |
| "loss": 2.6639687418937683, |
| "micro_batch_size": 2, |
| "output_kl": 0.9797117039561272, |
| "output_kl_term": 0.9797117039561272, |
| "probe_output_kl": 1.0213916301727295, |
| "probe_router_entropy": 4.537958733240763, |
| "probe_router_kl": 0.05228007212281227, |
| "probe_same_expert_ratio": 0.8512044270833333, |
| "probe_zero_compute_loss": 0.5116603970527649, |
| "probe_zero_compute_margin_gap": 0.37900154244465134, |
| "probe_zero_compute_mass": 0.014502804105480513, |
| "probe_zero_compute_top1_ratio": 0.02294921875, |
| "probe_zero_compute_topk_ratio": 0.3021158854166667, |
| "router_entropy": 4.542559911807379, |
| "router_kl": 0.05281998496502638, |
| "router_kl_term": 0.05281998496502638, |
| "same_expert_ratio": 0.85335693359375, |
| "step": 100, |
| "tokens_per_optimizer_step": 8192, |
| "zero_compute_loss": 0.5438123419880867, |
| "zero_compute_margin_gap": 0.4289153911076331, |
| "zero_compute_mass": 0.013749805480862657, |
| "zero_compute_ramp_steps": 50, |
| "zero_compute_term": 1.6314370036125183, |
| "zero_compute_top1_ratio": 0.019120279947916666, |
| "zero_compute_topk_margin": 0.0, |
| "zero_compute_topk_ratio": 0.26689453125, |
| "zero_compute_warmup_steps": 0, |
| "zero_expert_usage": 4699.0 |
| } |
| }, |
| "model_config": { |
| "add_zero_compute_expert": true, |
| "num_experts": 128, |
| "top_k_experts": 8, |
| "use_zero_compute_optimization": true |
| }, |
| "num_router_keys": 90, |
| "router_keys_sample": [ |
| "model.layers.0.router.per_expert_scale", |
| "model.layers.0.router.proj.weight", |
| "model.layers.0.router.scale", |
| "model.layers.1.router.per_expert_scale", |
| "model.layers.1.router.proj.weight", |
| "model.layers.1.router.scale", |
| "model.layers.10.router.per_expert_scale", |
| "model.layers.10.router.proj.weight", |
| "model.layers.10.router.scale", |
| "model.layers.11.router.per_expert_scale", |
| "model.layers.11.router.proj.weight", |
| "model.layers.11.router.scale" |
| ], |
| "source_model_id": "" |
| }, |
| "config_path": "/cache/router_artifacts/router_config.json", |
| "loaded": true, |
| "loaded_key_count": 90, |
| "loaded_keys_sample": [ |
| "model.layers.0.router.per_expert_scale", |
| "model.layers.0.router.proj.weight", |
| "model.layers.0.router.scale", |
| "model.layers.1.router.per_expert_scale", |
| "model.layers.1.router.proj.weight", |
| "model.layers.1.router.scale", |
| "model.layers.10.router.per_expert_scale", |
| "model.layers.10.router.proj.weight", |
| "model.layers.10.router.scale", |
| "model.layers.11.router.per_expert_scale", |
| "model.layers.11.router.proj.weight", |
| "model.layers.11.router.scale" |
| ], |
| "path": "/cache/router_artifacts/router_state_dict.pt" |
| }, |
| "step": 500, |
| "training_metrics": { |
| "current_lambda_zero_compute": 3.0, |
| "easy_token_ratio": 0.88720703125, |
| "effective_batch_size": 16, |
| "entropy_loss": 4.538989106814067, |
| "entropy_term": 0.009077978213628133, |
| "expert_usage_sample": [ |
| 1200.0, |
| 2028.0, |
| 1208.0, |
| 783.0, |
| 2040.0, |
| 1847.0, |
| 497.0, |
| 1490.0, |
| 1385.0, |
| 2217.0, |
| 1843.0, |
| 1175.0, |
| 1406.0, |
| 1502.0, |
| 1687.0, |
| 771.0 |
| ], |
| "grad_norm": 204.0, |
| "gradient_accumulation_steps": 8, |
| "lambda_entropy": 0.002, |
| "lambda_router": 1.0, |
| "lambda_zero_compute": 3.0, |
| "loss": 2.76311457157135, |
| "micro_batch_size": 2, |
| "output_kl": 1.1192611530423164, |
| "output_kl_term": 1.1192611530423164, |
| "probe_easy_token_ratio": 0.9248046875, |
| "probe_output_kl": 1.1542232036590576, |
| "probe_router_entropy": 4.538172864913941, |
| "probe_router_kl": 0.0544092059135437, |
| "probe_same_expert_ratio": 0.8352864583333334, |
| "probe_teacher_confidence_mean": 0.6827144622802734, |
| "probe_zero_compute_loss": 0.5005475282669067, |
| "probe_zero_compute_margin_gap": 0.3712589807061819, |
| "probe_zero_compute_mass": 0.01432527024565543, |
| "probe_zero_compute_token_weight_mean": 0.6357069611549377, |
| "probe_zero_compute_top1_hits_actual": 568.0, |
| "probe_zero_compute_top1_ratio": 0.017643229166666666, |
| "probe_zero_compute_top1_ratio_actual": 0.018489583333333334, |
| "probe_zero_compute_topk_hits_actual": 10002.0, |
| "probe_zero_compute_topk_ratio": 0.31997760956028976, |
| "probe_zero_compute_topk_ratio_actual": 0.0406982421875, |
| "router_entropy": 4.538989106814067, |
| "router_kl": 0.054629013407975435, |
| "router_kl_term": 0.054629013407975435, |
| "same_expert_ratio": 0.8434204101562499, |
| "step": 500, |
| "teacher_confidence_mean": 0.6225322559475899, |
| "tokens_per_optimizer_step": 8192, |
| "zero_compute_loss": 0.5267154797911644, |
| "zero_compute_margin_gap": 0.418523011850672, |
| "zero_compute_mass": 0.01356517664706988, |
| "zero_compute_ramp_steps": 50, |
| "zero_compute_term": 1.5801464468240738, |
| "zero_compute_token_weight_mean": 0.5501668378710747, |
| "zero_compute_top1_hits_actual": 459.0, |
| "zero_compute_top1_ratio": 0.014103190104166665, |
| "zero_compute_top1_ratio_actual": 0.014941406249999997, |
| "zero_compute_topk_hits_actual": 8916.125, |
| "zero_compute_topk_margin": 0.0, |
| "zero_compute_topk_ratio": 0.27892842232367093, |
| "zero_compute_topk_ratio_actual": 0.03627980550130208, |
| "zero_compute_warmup_steps": 0, |
| "zero_expert_usage": 3466.0 |
| } |
| }, |
| "model_config": { |
| "add_zero_compute_expert": true, |
| "num_experts": 128, |
| "top_k_experts": 8, |
| "use_zero_compute_optimization": true |
| }, |
| "num_router_keys": 90, |
| "router_keys_sample": [ |
| "model.layers.0.router.per_expert_scale", |
| "model.layers.0.router.proj.weight", |
| "model.layers.0.router.scale", |
| "model.layers.1.router.per_expert_scale", |
| "model.layers.1.router.proj.weight", |
| "model.layers.1.router.scale", |
| "model.layers.10.router.per_expert_scale", |
| "model.layers.10.router.proj.weight", |
| "model.layers.10.router.scale", |
| "model.layers.11.router.per_expert_scale", |
| "model.layers.11.router.proj.weight", |
| "model.layers.11.router.scale" |
| ], |
| "source_model_id": "" |
| }, |
| "config_path": "/cache/router_artifacts/router_config.json", |
| "loaded": true, |
| "loaded_key_count": 90, |
| "loaded_keys_sample": [ |
| "model.layers.0.router.per_expert_scale", |
| "model.layers.0.router.proj.weight", |
| "model.layers.0.router.scale", |
| "model.layers.1.router.per_expert_scale", |
| "model.layers.1.router.proj.weight", |
| "model.layers.1.router.scale", |
| "model.layers.10.router.per_expert_scale", |
| "model.layers.10.router.proj.weight", |
| "model.layers.10.router.scale", |
| "model.layers.11.router.per_expert_scale", |
| "model.layers.11.router.proj.weight", |
| "model.layers.11.router.scale" |
| ], |
| "path": "/cache/router_artifacts/router_state_dict.pt" |
| }, |
| "source_model_id": "google/gemma-4-26B-A4B-it", |
| "torch_dtype": "torch.bfloat16" |
| } |