gemma4-zero-compute / export_summary.json

Upload optimized Gemma4 checkpoint

0b0ec56 verified 8 days ago

14.9 kB

	{
	"base_model_class": "Gemma4TextCausalLMProxy",
	"code_files": [
	"/tmp/gemma4-hf-export-zxa_ushe/configuration_gemma4.py",
	"/tmp/gemma4-hf-export-zxa_ushe/modeling_gemma4.py",
	"/tmp/gemma4-hf-export-zxa_ushe/gemma4_optimization.py",
	"/tmp/gemma4-hf-export-zxa_ushe/__init__.py"
	],
	"max_shard_size": "5GB",
	"model_class": "OptimizedGemma4ForCausalLM",
	"output_dir": "/tmp/gemma4-hf-export-zxa_ushe",
	"repo_id": "haysonC/gemma4-zero-compute",
	"router_checkpoint": {
	"config": {
	"metadata": {
	"resume_summary": {
	"config": {
	"metadata": {
	"resume_summary": {
	"config": {
	"metadata": {
	"resume_summary": {
	"loaded": false,
	"reason": "resume disabled"
	},
	"step": 500,
	"training_metrics": {
	"current_lambda_zero_compute": 2.0,
	"effective_batch_size": 16,
	"entropy_loss": 4.538632531960806,
	"entropy_term": 0.0,
	"expert_usage_sample": [
	1240.0,
	2139.0,
	1213.0,
	793.0,
	1992.0,
	1957.0,
	503.0,
	1528.0,
	1398.0,
	2252.0,
	1835.0,
	1146.0,
	1440.0,
	1527.0,
	1757.0,
	803.0
	],
	"grad_norm": 70.5,
	"gradient_accumulation_steps": 8,
	"lambda_entropy": 0.0,
	"lambda_router": 1.0,
	"lambda_zero_compute": 2.0,
	"loss": 2.327640622854233,
	"micro_batch_size": 2,
	"output_kl": 1.1904816403985023,
	"output_kl_term": 1.1904816403985023,
	"probe_output_kl": 1.1182771921157837,
	"probe_router_entropy": 4.540449047088623,
	"probe_router_kl": 0.051853783428668976,
	"probe_same_expert_ratio": 0.8435872395833334,
	"probe_zero_compute_loss": 0.5133160352706909,
	"probe_zero_compute_margin_gap": 0.41453187317432216,
	"probe_zero_compute_mass": 0.013335910812020301,
	"probe_zero_compute_top1_ratio": 0.00654296875,
	"probe_zero_compute_topk_ratio": 0.29026692708333335,
	"router_entropy": 4.538632531960806,
	"router_kl": 0.05338437343016267,
	"router_kl_term": 0.05338437343016267,
	"same_expert_ratio": 0.8424682617187501,
	"step": 500,
	"tokens_per_optimizer_step": 8192,
	"zero_compute_loss": 0.5418873056769371,
	"zero_compute_margin_gap": 0.4672557485134652,
	"zero_compute_mass": 0.012517090452214084,
	"zero_compute_ramp_steps": 50,
	"zero_compute_term": 1.0837746113538742,
	"zero_compute_top1_ratio": 0.00439453125,
	"zero_compute_topk_margin": 0.0,
	"zero_compute_topk_ratio": 0.24339599609374998,
	"zero_compute_warmup_steps": 0,
	"zero_expert_usage": 1080.0
	}
	},
	"model_config": {
	"add_zero_compute_expert": true,
	"num_experts": 128,
	"top_k_experts": 8,
	"use_zero_compute_optimization": true
	},
	"num_router_keys": 90,
	"router_keys_sample": [
	"model.layers.0.router.per_expert_scale",
	"model.layers.0.router.proj.weight",
	"model.layers.0.router.scale",
	"model.layers.1.router.per_expert_scale",
	"model.layers.1.router.proj.weight",
	"model.layers.1.router.scale",
	"model.layers.10.router.per_expert_scale",
	"model.layers.10.router.proj.weight",
	"model.layers.10.router.scale",
	"model.layers.11.router.per_expert_scale",
	"model.layers.11.router.proj.weight",
	"model.layers.11.router.scale"
	],
	"source_model_id": ""
	},
	"config_path": "/cache/router_artifacts/router_config.json",
	"loaded": true,
	"loaded_key_count": 90,
	"loaded_keys_sample": [
	"model.layers.0.router.per_expert_scale",
	"model.layers.0.router.proj.weight",
	"model.layers.0.router.scale",
	"model.layers.1.router.per_expert_scale",
	"model.layers.1.router.proj.weight",
	"model.layers.1.router.scale",
	"model.layers.10.router.per_expert_scale",
	"model.layers.10.router.proj.weight",
	"model.layers.10.router.scale",
	"model.layers.11.router.per_expert_scale",
	"model.layers.11.router.proj.weight",
	"model.layers.11.router.scale"
	],
	"path": "/cache/router_artifacts/router_state_dict.pt"
	},
	"step": 100,
	"training_metrics": {
	"current_lambda_zero_compute": 3.0,
	"effective_batch_size": 16,
	"entropy_loss": 4.542559911807379,
	"entropy_term": 0.0,
	"expert_usage_sample": [
	1239.0,
	2070.0,
	1500.0,
	1201.0,
	2009.0,
	1821.0,
	670.0,
	1778.0,
	1452.0,
	2154.0,
	2007.0,
	980.0,
	1320.0,
	1568.0,
	1522.0,
	700.0
	],
	"grad_norm": 446.0,
	"gradient_accumulation_steps": 8,
	"lambda_entropy": 0.0,
	"lambda_router": 1.0,
	"lambda_zero_compute": 3.0,
	"loss": 2.6639687418937683,
	"micro_batch_size": 2,
	"output_kl": 0.9797117039561272,
	"output_kl_term": 0.9797117039561272,
	"probe_output_kl": 1.0213916301727295,
	"probe_router_entropy": 4.537958733240763,
	"probe_router_kl": 0.05228007212281227,
	"probe_same_expert_ratio": 0.8512044270833333,
	"probe_zero_compute_loss": 0.5116603970527649,
	"probe_zero_compute_margin_gap": 0.37900154244465134,
	"probe_zero_compute_mass": 0.014502804105480513,
	"probe_zero_compute_top1_ratio": 0.02294921875,
	"probe_zero_compute_topk_ratio": 0.3021158854166667,
	"router_entropy": 4.542559911807379,
	"router_kl": 0.05281998496502638,
	"router_kl_term": 0.05281998496502638,
	"same_expert_ratio": 0.85335693359375,
	"step": 100,
	"tokens_per_optimizer_step": 8192,
	"zero_compute_loss": 0.5438123419880867,
	"zero_compute_margin_gap": 0.4289153911076331,
	"zero_compute_mass": 0.013749805480862657,
	"zero_compute_ramp_steps": 50,
	"zero_compute_term": 1.6314370036125183,
	"zero_compute_top1_ratio": 0.019120279947916666,
	"zero_compute_topk_margin": 0.0,
	"zero_compute_topk_ratio": 0.26689453125,
	"zero_compute_warmup_steps": 0,
	"zero_expert_usage": 4699.0
	}
	},
	"model_config": {
	"add_zero_compute_expert": true,
	"num_experts": 128,
	"top_k_experts": 8,
	"use_zero_compute_optimization": true
	},
	"num_router_keys": 90,
	"router_keys_sample": [
	"model.layers.0.router.per_expert_scale",
	"model.layers.0.router.proj.weight",
	"model.layers.0.router.scale",
	"model.layers.1.router.per_expert_scale",
	"model.layers.1.router.proj.weight",
	"model.layers.1.router.scale",
	"model.layers.10.router.per_expert_scale",
	"model.layers.10.router.proj.weight",
	"model.layers.10.router.scale",
	"model.layers.11.router.per_expert_scale",
	"model.layers.11.router.proj.weight",
	"model.layers.11.router.scale"
	],
	"source_model_id": ""
	},
	"config_path": "/cache/router_artifacts/router_config.json",
	"loaded": true,
	"loaded_key_count": 90,
	"loaded_keys_sample": [
	"model.layers.0.router.per_expert_scale",
	"model.layers.0.router.proj.weight",
	"model.layers.0.router.scale",
	"model.layers.1.router.per_expert_scale",
	"model.layers.1.router.proj.weight",
	"model.layers.1.router.scale",
	"model.layers.10.router.per_expert_scale",
	"model.layers.10.router.proj.weight",
	"model.layers.10.router.scale",
	"model.layers.11.router.per_expert_scale",
	"model.layers.11.router.proj.weight",
	"model.layers.11.router.scale"
	],
	"path": "/cache/router_artifacts/router_state_dict.pt"
	},
	"step": 500,
	"training_metrics": {
	"current_lambda_zero_compute": 3.0,
	"easy_token_ratio": 0.88720703125,
	"effective_batch_size": 16,
	"entropy_loss": 4.538989106814067,
	"entropy_term": 0.009077978213628133,
	"expert_usage_sample": [
	1200.0,
	2028.0,
	1208.0,
	783.0,
	2040.0,
	1847.0,
	497.0,
	1490.0,
	1385.0,
	2217.0,
	1843.0,
	1175.0,
	1406.0,
	1502.0,
	1687.0,
	771.0
	],
	"grad_norm": 204.0,
	"gradient_accumulation_steps": 8,
	"lambda_entropy": 0.002,
	"lambda_router": 1.0,
	"lambda_zero_compute": 3.0,
	"loss": 2.76311457157135,
	"micro_batch_size": 2,
	"output_kl": 1.1192611530423164,
	"output_kl_term": 1.1192611530423164,
	"probe_easy_token_ratio": 0.9248046875,
	"probe_output_kl": 1.1542232036590576,
	"probe_router_entropy": 4.538172864913941,
	"probe_router_kl": 0.0544092059135437,
	"probe_same_expert_ratio": 0.8352864583333334,
	"probe_teacher_confidence_mean": 0.6827144622802734,
	"probe_zero_compute_loss": 0.5005475282669067,
	"probe_zero_compute_margin_gap": 0.3712589807061819,
	"probe_zero_compute_mass": 0.01432527024565543,
	"probe_zero_compute_token_weight_mean": 0.6357069611549377,
	"probe_zero_compute_top1_hits_actual": 568.0,
	"probe_zero_compute_top1_ratio": 0.017643229166666666,
	"probe_zero_compute_top1_ratio_actual": 0.018489583333333334,
	"probe_zero_compute_topk_hits_actual": 10002.0,
	"probe_zero_compute_topk_ratio": 0.31997760956028976,
	"probe_zero_compute_topk_ratio_actual": 0.0406982421875,
	"router_entropy": 4.538989106814067,
	"router_kl": 0.054629013407975435,
	"router_kl_term": 0.054629013407975435,
	"same_expert_ratio": 0.8434204101562499,
	"step": 500,
	"teacher_confidence_mean": 0.6225322559475899,
	"tokens_per_optimizer_step": 8192,
	"zero_compute_loss": 0.5267154797911644,
	"zero_compute_margin_gap": 0.418523011850672,
	"zero_compute_mass": 0.01356517664706988,
	"zero_compute_ramp_steps": 50,
	"zero_compute_term": 1.5801464468240738,
	"zero_compute_token_weight_mean": 0.5501668378710747,
	"zero_compute_top1_hits_actual": 459.0,
	"zero_compute_top1_ratio": 0.014103190104166665,
	"zero_compute_top1_ratio_actual": 0.014941406249999997,
	"zero_compute_topk_hits_actual": 8916.125,
	"zero_compute_topk_margin": 0.0,
	"zero_compute_topk_ratio": 0.27892842232367093,
	"zero_compute_topk_ratio_actual": 0.03627980550130208,
	"zero_compute_warmup_steps": 0,
	"zero_expert_usage": 3466.0
	}
	},
	"model_config": {
	"add_zero_compute_expert": true,
	"num_experts": 128,
	"top_k_experts": 8,
	"use_zero_compute_optimization": true
	},
	"num_router_keys": 90,
	"router_keys_sample": [
	"model.layers.0.router.per_expert_scale",
	"model.layers.0.router.proj.weight",
	"model.layers.0.router.scale",
	"model.layers.1.router.per_expert_scale",
	"model.layers.1.router.proj.weight",
	"model.layers.1.router.scale",
	"model.layers.10.router.per_expert_scale",
	"model.layers.10.router.proj.weight",
	"model.layers.10.router.scale",
	"model.layers.11.router.per_expert_scale",
	"model.layers.11.router.proj.weight",
	"model.layers.11.router.scale"
	],
	"source_model_id": ""
	},
	"config_path": "/cache/router_artifacts/router_config.json",
	"loaded": true,
	"loaded_key_count": 90,
	"loaded_keys_sample": [
	"model.layers.0.router.per_expert_scale",
	"model.layers.0.router.proj.weight",
	"model.layers.0.router.scale",
	"model.layers.1.router.per_expert_scale",
	"model.layers.1.router.proj.weight",
	"model.layers.1.router.scale",
	"model.layers.10.router.per_expert_scale",
	"model.layers.10.router.proj.weight",
	"model.layers.10.router.scale",
	"model.layers.11.router.per_expert_scale",
	"model.layers.11.router.proj.weight",
	"model.layers.11.router.scale"
	],
	"path": "/cache/router_artifacts/router_state_dict.pt"
	},
	"source_model_id": "google/gemma-4-26B-A4B-it",
	"torch_dtype": "torch.bfloat16"
	}