{ "base_model_class": "Gemma4TextCausalLMProxy", "code_files": [ "/tmp/gemma4-hf-export-zxa_ushe/configuration_gemma4.py", "/tmp/gemma4-hf-export-zxa_ushe/modeling_gemma4.py", "/tmp/gemma4-hf-export-zxa_ushe/gemma4_optimization.py", "/tmp/gemma4-hf-export-zxa_ushe/__init__.py" ], "max_shard_size": "5GB", "model_class": "OptimizedGemma4ForCausalLM", "output_dir": "/tmp/gemma4-hf-export-zxa_ushe", "repo_id": "haysonC/gemma4-zero-compute", "router_checkpoint": { "config": { "metadata": { "resume_summary": { "config": { "metadata": { "resume_summary": { "config": { "metadata": { "resume_summary": { "loaded": false, "reason": "resume disabled" }, "step": 500, "training_metrics": { "current_lambda_zero_compute": 2.0, "effective_batch_size": 16, "entropy_loss": 4.538632531960806, "entropy_term": 0.0, "expert_usage_sample": [ 1240.0, 2139.0, 1213.0, 793.0, 1992.0, 1957.0, 503.0, 1528.0, 1398.0, 2252.0, 1835.0, 1146.0, 1440.0, 1527.0, 1757.0, 803.0 ], "grad_norm": 70.5, "gradient_accumulation_steps": 8, "lambda_entropy": 0.0, "lambda_router": 1.0, "lambda_zero_compute": 2.0, "loss": 2.327640622854233, "micro_batch_size": 2, "output_kl": 1.1904816403985023, "output_kl_term": 1.1904816403985023, "probe_output_kl": 1.1182771921157837, "probe_router_entropy": 4.540449047088623, "probe_router_kl": 0.051853783428668976, "probe_same_expert_ratio": 0.8435872395833334, "probe_zero_compute_loss": 0.5133160352706909, "probe_zero_compute_margin_gap": 0.41453187317432216, "probe_zero_compute_mass": 0.013335910812020301, "probe_zero_compute_top1_ratio": 0.00654296875, "probe_zero_compute_topk_ratio": 0.29026692708333335, "router_entropy": 4.538632531960806, "router_kl": 0.05338437343016267, "router_kl_term": 0.05338437343016267, "same_expert_ratio": 0.8424682617187501, "step": 500, "tokens_per_optimizer_step": 8192, "zero_compute_loss": 0.5418873056769371, "zero_compute_margin_gap": 0.4672557485134652, "zero_compute_mass": 0.012517090452214084, "zero_compute_ramp_steps": 50, "zero_compute_term": 1.0837746113538742, "zero_compute_top1_ratio": 0.00439453125, "zero_compute_topk_margin": 0.0, "zero_compute_topk_ratio": 0.24339599609374998, "zero_compute_warmup_steps": 0, "zero_expert_usage": 1080.0 } }, "model_config": { "add_zero_compute_expert": true, "num_experts": 128, "top_k_experts": 8, "use_zero_compute_optimization": true }, "num_router_keys": 90, "router_keys_sample": [ "model.layers.0.router.per_expert_scale", "model.layers.0.router.proj.weight", "model.layers.0.router.scale", "model.layers.1.router.per_expert_scale", "model.layers.1.router.proj.weight", "model.layers.1.router.scale", "model.layers.10.router.per_expert_scale", "model.layers.10.router.proj.weight", "model.layers.10.router.scale", "model.layers.11.router.per_expert_scale", "model.layers.11.router.proj.weight", "model.layers.11.router.scale" ], "source_model_id": "" }, "config_path": "/cache/router_artifacts/router_config.json", "loaded": true, "loaded_key_count": 90, "loaded_keys_sample": [ "model.layers.0.router.per_expert_scale", "model.layers.0.router.proj.weight", "model.layers.0.router.scale", "model.layers.1.router.per_expert_scale", "model.layers.1.router.proj.weight", "model.layers.1.router.scale", "model.layers.10.router.per_expert_scale", "model.layers.10.router.proj.weight", "model.layers.10.router.scale", "model.layers.11.router.per_expert_scale", "model.layers.11.router.proj.weight", "model.layers.11.router.scale" ], "path": "/cache/router_artifacts/router_state_dict.pt" }, "step": 100, "training_metrics": { "current_lambda_zero_compute": 3.0, "effective_batch_size": 16, "entropy_loss": 4.542559911807379, "entropy_term": 0.0, "expert_usage_sample": [ 1239.0, 2070.0, 1500.0, 1201.0, 2009.0, 1821.0, 670.0, 1778.0, 1452.0, 2154.0, 2007.0, 980.0, 1320.0, 1568.0, 1522.0, 700.0 ], "grad_norm": 446.0, "gradient_accumulation_steps": 8, "lambda_entropy": 0.0, "lambda_router": 1.0, "lambda_zero_compute": 3.0, "loss": 2.6639687418937683, "micro_batch_size": 2, "output_kl": 0.9797117039561272, "output_kl_term": 0.9797117039561272, "probe_output_kl": 1.0213916301727295, "probe_router_entropy": 4.537958733240763, "probe_router_kl": 0.05228007212281227, "probe_same_expert_ratio": 0.8512044270833333, "probe_zero_compute_loss": 0.5116603970527649, "probe_zero_compute_margin_gap": 0.37900154244465134, "probe_zero_compute_mass": 0.014502804105480513, "probe_zero_compute_top1_ratio": 0.02294921875, "probe_zero_compute_topk_ratio": 0.3021158854166667, "router_entropy": 4.542559911807379, "router_kl": 0.05281998496502638, "router_kl_term": 0.05281998496502638, "same_expert_ratio": 0.85335693359375, "step": 100, "tokens_per_optimizer_step": 8192, "zero_compute_loss": 0.5438123419880867, "zero_compute_margin_gap": 0.4289153911076331, "zero_compute_mass": 0.013749805480862657, "zero_compute_ramp_steps": 50, "zero_compute_term": 1.6314370036125183, "zero_compute_top1_ratio": 0.019120279947916666, "zero_compute_topk_margin": 0.0, "zero_compute_topk_ratio": 0.26689453125, "zero_compute_warmup_steps": 0, "zero_expert_usage": 4699.0 } }, "model_config": { "add_zero_compute_expert": true, "num_experts": 128, "top_k_experts": 8, "use_zero_compute_optimization": true }, "num_router_keys": 90, "router_keys_sample": [ "model.layers.0.router.per_expert_scale", "model.layers.0.router.proj.weight", "model.layers.0.router.scale", "model.layers.1.router.per_expert_scale", "model.layers.1.router.proj.weight", "model.layers.1.router.scale", "model.layers.10.router.per_expert_scale", "model.layers.10.router.proj.weight", "model.layers.10.router.scale", "model.layers.11.router.per_expert_scale", "model.layers.11.router.proj.weight", "model.layers.11.router.scale" ], "source_model_id": "" }, "config_path": "/cache/router_artifacts/router_config.json", "loaded": true, "loaded_key_count": 90, "loaded_keys_sample": [ "model.layers.0.router.per_expert_scale", "model.layers.0.router.proj.weight", "model.layers.0.router.scale", "model.layers.1.router.per_expert_scale", "model.layers.1.router.proj.weight", "model.layers.1.router.scale", "model.layers.10.router.per_expert_scale", "model.layers.10.router.proj.weight", "model.layers.10.router.scale", "model.layers.11.router.per_expert_scale", "model.layers.11.router.proj.weight", "model.layers.11.router.scale" ], "path": "/cache/router_artifacts/router_state_dict.pt" }, "step": 500, "training_metrics": { "current_lambda_zero_compute": 3.0, "easy_token_ratio": 0.88720703125, "effective_batch_size": 16, "entropy_loss": 4.538989106814067, "entropy_term": 0.009077978213628133, "expert_usage_sample": [ 1200.0, 2028.0, 1208.0, 783.0, 2040.0, 1847.0, 497.0, 1490.0, 1385.0, 2217.0, 1843.0, 1175.0, 1406.0, 1502.0, 1687.0, 771.0 ], "grad_norm": 204.0, "gradient_accumulation_steps": 8, "lambda_entropy": 0.002, "lambda_router": 1.0, "lambda_zero_compute": 3.0, "loss": 2.76311457157135, "micro_batch_size": 2, "output_kl": 1.1192611530423164, "output_kl_term": 1.1192611530423164, "probe_easy_token_ratio": 0.9248046875, "probe_output_kl": 1.1542232036590576, "probe_router_entropy": 4.538172864913941, "probe_router_kl": 0.0544092059135437, "probe_same_expert_ratio": 0.8352864583333334, "probe_teacher_confidence_mean": 0.6827144622802734, "probe_zero_compute_loss": 0.5005475282669067, "probe_zero_compute_margin_gap": 0.3712589807061819, "probe_zero_compute_mass": 0.01432527024565543, "probe_zero_compute_token_weight_mean": 0.6357069611549377, "probe_zero_compute_top1_hits_actual": 568.0, "probe_zero_compute_top1_ratio": 0.017643229166666666, "probe_zero_compute_top1_ratio_actual": 0.018489583333333334, "probe_zero_compute_topk_hits_actual": 10002.0, "probe_zero_compute_topk_ratio": 0.31997760956028976, "probe_zero_compute_topk_ratio_actual": 0.0406982421875, "router_entropy": 4.538989106814067, "router_kl": 0.054629013407975435, "router_kl_term": 0.054629013407975435, "same_expert_ratio": 0.8434204101562499, "step": 500, "teacher_confidence_mean": 0.6225322559475899, "tokens_per_optimizer_step": 8192, "zero_compute_loss": 0.5267154797911644, "zero_compute_margin_gap": 0.418523011850672, "zero_compute_mass": 0.01356517664706988, "zero_compute_ramp_steps": 50, "zero_compute_term": 1.5801464468240738, "zero_compute_token_weight_mean": 0.5501668378710747, "zero_compute_top1_hits_actual": 459.0, "zero_compute_top1_ratio": 0.014103190104166665, "zero_compute_top1_ratio_actual": 0.014941406249999997, "zero_compute_topk_hits_actual": 8916.125, "zero_compute_topk_margin": 0.0, "zero_compute_topk_ratio": 0.27892842232367093, "zero_compute_topk_ratio_actual": 0.03627980550130208, "zero_compute_warmup_steps": 0, "zero_expert_usage": 3466.0 } }, "model_config": { "add_zero_compute_expert": true, "num_experts": 128, "top_k_experts": 8, "use_zero_compute_optimization": true }, "num_router_keys": 90, "router_keys_sample": [ "model.layers.0.router.per_expert_scale", "model.layers.0.router.proj.weight", "model.layers.0.router.scale", "model.layers.1.router.per_expert_scale", "model.layers.1.router.proj.weight", "model.layers.1.router.scale", "model.layers.10.router.per_expert_scale", "model.layers.10.router.proj.weight", "model.layers.10.router.scale", "model.layers.11.router.per_expert_scale", "model.layers.11.router.proj.weight", "model.layers.11.router.scale" ], "source_model_id": "" }, "config_path": "/cache/router_artifacts/router_config.json", "loaded": true, "loaded_key_count": 90, "loaded_keys_sample": [ "model.layers.0.router.per_expert_scale", "model.layers.0.router.proj.weight", "model.layers.0.router.scale", "model.layers.1.router.per_expert_scale", "model.layers.1.router.proj.weight", "model.layers.1.router.scale", "model.layers.10.router.per_expert_scale", "model.layers.10.router.proj.weight", "model.layers.10.router.scale", "model.layers.11.router.per_expert_scale", "model.layers.11.router.proj.weight", "model.layers.11.router.scale" ], "path": "/cache/router_artifacts/router_state_dict.pt" }, "source_model_id": "google/gemma-4-26B-A4B-it", "torch_dtype": "torch.bfloat16" }