gemma4-zero-compute / export_summary.json
haysonC's picture
Upload optimized Gemma4 checkpoint
0b0ec56 verified
{
"base_model_class": "Gemma4TextCausalLMProxy",
"code_files": [
"/tmp/gemma4-hf-export-zxa_ushe/configuration_gemma4.py",
"/tmp/gemma4-hf-export-zxa_ushe/modeling_gemma4.py",
"/tmp/gemma4-hf-export-zxa_ushe/gemma4_optimization.py",
"/tmp/gemma4-hf-export-zxa_ushe/__init__.py"
],
"max_shard_size": "5GB",
"model_class": "OptimizedGemma4ForCausalLM",
"output_dir": "/tmp/gemma4-hf-export-zxa_ushe",
"repo_id": "haysonC/gemma4-zero-compute",
"router_checkpoint": {
"config": {
"metadata": {
"resume_summary": {
"config": {
"metadata": {
"resume_summary": {
"config": {
"metadata": {
"resume_summary": {
"loaded": false,
"reason": "resume disabled"
},
"step": 500,
"training_metrics": {
"current_lambda_zero_compute": 2.0,
"effective_batch_size": 16,
"entropy_loss": 4.538632531960806,
"entropy_term": 0.0,
"expert_usage_sample": [
1240.0,
2139.0,
1213.0,
793.0,
1992.0,
1957.0,
503.0,
1528.0,
1398.0,
2252.0,
1835.0,
1146.0,
1440.0,
1527.0,
1757.0,
803.0
],
"grad_norm": 70.5,
"gradient_accumulation_steps": 8,
"lambda_entropy": 0.0,
"lambda_router": 1.0,
"lambda_zero_compute": 2.0,
"loss": 2.327640622854233,
"micro_batch_size": 2,
"output_kl": 1.1904816403985023,
"output_kl_term": 1.1904816403985023,
"probe_output_kl": 1.1182771921157837,
"probe_router_entropy": 4.540449047088623,
"probe_router_kl": 0.051853783428668976,
"probe_same_expert_ratio": 0.8435872395833334,
"probe_zero_compute_loss": 0.5133160352706909,
"probe_zero_compute_margin_gap": 0.41453187317432216,
"probe_zero_compute_mass": 0.013335910812020301,
"probe_zero_compute_top1_ratio": 0.00654296875,
"probe_zero_compute_topk_ratio": 0.29026692708333335,
"router_entropy": 4.538632531960806,
"router_kl": 0.05338437343016267,
"router_kl_term": 0.05338437343016267,
"same_expert_ratio": 0.8424682617187501,
"step": 500,
"tokens_per_optimizer_step": 8192,
"zero_compute_loss": 0.5418873056769371,
"zero_compute_margin_gap": 0.4672557485134652,
"zero_compute_mass": 0.012517090452214084,
"zero_compute_ramp_steps": 50,
"zero_compute_term": 1.0837746113538742,
"zero_compute_top1_ratio": 0.00439453125,
"zero_compute_topk_margin": 0.0,
"zero_compute_topk_ratio": 0.24339599609374998,
"zero_compute_warmup_steps": 0,
"zero_expert_usage": 1080.0
}
},
"model_config": {
"add_zero_compute_expert": true,
"num_experts": 128,
"top_k_experts": 8,
"use_zero_compute_optimization": true
},
"num_router_keys": 90,
"router_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"source_model_id": ""
},
"config_path": "/cache/router_artifacts/router_config.json",
"loaded": true,
"loaded_key_count": 90,
"loaded_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"path": "/cache/router_artifacts/router_state_dict.pt"
},
"step": 100,
"training_metrics": {
"current_lambda_zero_compute": 3.0,
"effective_batch_size": 16,
"entropy_loss": 4.542559911807379,
"entropy_term": 0.0,
"expert_usage_sample": [
1239.0,
2070.0,
1500.0,
1201.0,
2009.0,
1821.0,
670.0,
1778.0,
1452.0,
2154.0,
2007.0,
980.0,
1320.0,
1568.0,
1522.0,
700.0
],
"grad_norm": 446.0,
"gradient_accumulation_steps": 8,
"lambda_entropy": 0.0,
"lambda_router": 1.0,
"lambda_zero_compute": 3.0,
"loss": 2.6639687418937683,
"micro_batch_size": 2,
"output_kl": 0.9797117039561272,
"output_kl_term": 0.9797117039561272,
"probe_output_kl": 1.0213916301727295,
"probe_router_entropy": 4.537958733240763,
"probe_router_kl": 0.05228007212281227,
"probe_same_expert_ratio": 0.8512044270833333,
"probe_zero_compute_loss": 0.5116603970527649,
"probe_zero_compute_margin_gap": 0.37900154244465134,
"probe_zero_compute_mass": 0.014502804105480513,
"probe_zero_compute_top1_ratio": 0.02294921875,
"probe_zero_compute_topk_ratio": 0.3021158854166667,
"router_entropy": 4.542559911807379,
"router_kl": 0.05281998496502638,
"router_kl_term": 0.05281998496502638,
"same_expert_ratio": 0.85335693359375,
"step": 100,
"tokens_per_optimizer_step": 8192,
"zero_compute_loss": 0.5438123419880867,
"zero_compute_margin_gap": 0.4289153911076331,
"zero_compute_mass": 0.013749805480862657,
"zero_compute_ramp_steps": 50,
"zero_compute_term": 1.6314370036125183,
"zero_compute_top1_ratio": 0.019120279947916666,
"zero_compute_topk_margin": 0.0,
"zero_compute_topk_ratio": 0.26689453125,
"zero_compute_warmup_steps": 0,
"zero_expert_usage": 4699.0
}
},
"model_config": {
"add_zero_compute_expert": true,
"num_experts": 128,
"top_k_experts": 8,
"use_zero_compute_optimization": true
},
"num_router_keys": 90,
"router_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"source_model_id": ""
},
"config_path": "/cache/router_artifacts/router_config.json",
"loaded": true,
"loaded_key_count": 90,
"loaded_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"path": "/cache/router_artifacts/router_state_dict.pt"
},
"step": 500,
"training_metrics": {
"current_lambda_zero_compute": 3.0,
"easy_token_ratio": 0.88720703125,
"effective_batch_size": 16,
"entropy_loss": 4.538989106814067,
"entropy_term": 0.009077978213628133,
"expert_usage_sample": [
1200.0,
2028.0,
1208.0,
783.0,
2040.0,
1847.0,
497.0,
1490.0,
1385.0,
2217.0,
1843.0,
1175.0,
1406.0,
1502.0,
1687.0,
771.0
],
"grad_norm": 204.0,
"gradient_accumulation_steps": 8,
"lambda_entropy": 0.002,
"lambda_router": 1.0,
"lambda_zero_compute": 3.0,
"loss": 2.76311457157135,
"micro_batch_size": 2,
"output_kl": 1.1192611530423164,
"output_kl_term": 1.1192611530423164,
"probe_easy_token_ratio": 0.9248046875,
"probe_output_kl": 1.1542232036590576,
"probe_router_entropy": 4.538172864913941,
"probe_router_kl": 0.0544092059135437,
"probe_same_expert_ratio": 0.8352864583333334,
"probe_teacher_confidence_mean": 0.6827144622802734,
"probe_zero_compute_loss": 0.5005475282669067,
"probe_zero_compute_margin_gap": 0.3712589807061819,
"probe_zero_compute_mass": 0.01432527024565543,
"probe_zero_compute_token_weight_mean": 0.6357069611549377,
"probe_zero_compute_top1_hits_actual": 568.0,
"probe_zero_compute_top1_ratio": 0.017643229166666666,
"probe_zero_compute_top1_ratio_actual": 0.018489583333333334,
"probe_zero_compute_topk_hits_actual": 10002.0,
"probe_zero_compute_topk_ratio": 0.31997760956028976,
"probe_zero_compute_topk_ratio_actual": 0.0406982421875,
"router_entropy": 4.538989106814067,
"router_kl": 0.054629013407975435,
"router_kl_term": 0.054629013407975435,
"same_expert_ratio": 0.8434204101562499,
"step": 500,
"teacher_confidence_mean": 0.6225322559475899,
"tokens_per_optimizer_step": 8192,
"zero_compute_loss": 0.5267154797911644,
"zero_compute_margin_gap": 0.418523011850672,
"zero_compute_mass": 0.01356517664706988,
"zero_compute_ramp_steps": 50,
"zero_compute_term": 1.5801464468240738,
"zero_compute_token_weight_mean": 0.5501668378710747,
"zero_compute_top1_hits_actual": 459.0,
"zero_compute_top1_ratio": 0.014103190104166665,
"zero_compute_top1_ratio_actual": 0.014941406249999997,
"zero_compute_topk_hits_actual": 8916.125,
"zero_compute_topk_margin": 0.0,
"zero_compute_topk_ratio": 0.27892842232367093,
"zero_compute_topk_ratio_actual": 0.03627980550130208,
"zero_compute_warmup_steps": 0,
"zero_expert_usage": 3466.0
}
},
"model_config": {
"add_zero_compute_expert": true,
"num_experts": 128,
"top_k_experts": 8,
"use_zero_compute_optimization": true
},
"num_router_keys": 90,
"router_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"source_model_id": ""
},
"config_path": "/cache/router_artifacts/router_config.json",
"loaded": true,
"loaded_key_count": 90,
"loaded_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"path": "/cache/router_artifacts/router_state_dict.pt"
},
"source_model_id": "google/gemma-4-26B-A4B-it",
"torch_dtype": "torch.bfloat16"
}