VoxCPM2-LiteRT-INT8 / config.json
aufklarer's picture
INT8 prefill: 7.7 GB FP32 → 1.9 GB INT8 weight-only; audio_encoder/decoder stay FP32 (ai_edge_quantizer rejects Conv ops). Total bundle 10 GB → 4.3 GB, runtime working set 13 GiB → ~5-6 GiB.
9ec65a0 verified
{
"architecture": "voxcpm2",
"lm_config": {
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_size": 2048,
"intermediate_size": 6144,
"max_position_embeddings": 32768,
"num_attention_heads": 16,
"num_hidden_layers": 28,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-05,
"rope_theta": 10000,
"kv_channels": 128,
"rope_scaling": {
"type": "longrope",
"long_factor": [
0.9977997200264581,
1.014658295992452,
1.0349680404997148,
1.059429246056193,
1.0888815016813513,
1.1243301355211495,
1.166977103606075,
1.2182568066927284,
1.2798772354275727,
1.3538666751582975,
1.4426259039919596,
1.5489853358570191,
1.6762658237220625,
1.8283407612492941,
2.0096956085876183,
2.225478927469756,
2.481536379650452,
2.784415934557119,
3.1413289096347365,
3.560047844772632,
4.048719380066383,
4.615569542115128,
5.2684819496549835,
6.014438591970396,
6.858830049237097,
7.804668263503327,
8.851768731513417,
9.99600492938444,
11.228766118181639,
12.536757560834843,
13.902257701387796,
15.303885189125953,
16.717837610115794,
18.119465097853947,
19.484965238406907,
20.792956681060105,
22.02571786985731,
23.16995406772833,
24.217054535738416,
25.16289275000465,
26.007284207271347,
26.753240849586767,
27.40615325712662,
27.973003419175363,
28.461674954469114,
28.880393889607006,
29.237306864684626,
29.540186419591297,
29.79624387177199,
30.01202719065413,
30.193382037992453,
30.34545697551969,
30.47273746338473,
30.579096895249787,
30.66785612408345,
30.741845563814174,
30.80346599254902,
30.85474569563567,
30.897392663720595,
30.932841297560394,
30.962293553185553,
30.986754758742034,
31.007064503249293,
31.02392307921529
],
"short_factor": [
0.9977997200264581,
1.014658295992452,
1.0349680404997148,
1.059429246056193,
1.0888815016813513,
1.1243301355211495,
1.166977103606075,
1.2182568066927284,
1.2798772354275727,
1.3538666751582975,
1.4426259039919596,
1.5489853358570191,
1.6762658237220625,
1.8283407612492941,
2.0096956085876183,
2.225478927469756,
2.481536379650452,
2.784415934557119,
3.1413289096347365,
3.560047844772632,
4.048719380066383,
4.615569542115128,
5.2684819496549835,
6.014438591970396,
6.858830049237097,
7.804668263503327,
8.851768731513417,
9.99600492938444,
11.228766118181639,
12.536757560834843,
13.902257701387796,
15.303885189125953,
16.717837610115794,
18.119465097853947,
19.484965238406907,
20.792956681060105,
22.02571786985731,
23.16995406772833,
24.217054535738416,
25.16289275000465,
26.007284207271347,
26.753240849586767,
27.40615325712662,
27.973003419175363,
28.461674954469114,
28.880393889607006,
29.237306864684626,
29.540186419591297,
29.79624387177199,
30.01202719065413,
30.193382037992453,
30.34545697551969,
30.47273746338473,
30.579096895249787,
30.66785612408345,
30.741845563814174,
30.80346599254902,
30.85474569563567,
30.897392663720595,
30.932841297560394,
30.962293553185553,
30.986754758742034,
31.007064503249293,
31.02392307921529
],
"original_max_position_embeddings": 32768
},
"vocab_size": 73448,
"use_mup": false,
"scale_emb": 12,
"dim_model_base": 256,
"scale_depth": 1.4
},
"patch_size": 4,
"feat_dim": 64,
"scalar_quantization_latent_dim": 512,
"scalar_quantization_scale": 9,
"residual_lm_num_layers": 8,
"residual_lm_no_rope": true,
"encoder_config": {
"hidden_dim": 1024,
"ffn_dim": 4096,
"num_heads": 16,
"num_layers": 12,
"kv_channels": 128
},
"dit_config": {
"hidden_dim": 1024,
"ffn_dim": 4096,
"num_heads": 16,
"num_layers": 12,
"kv_channels": 128,
"mean_mode": false,
"cfm_config": {
"sigma_min": 1e-06,
"solver": "euler",
"t_scheduler": "log-norm",
"inference_cfg_rate": 2.0
}
},
"audio_vae_config": {
"encoder_dim": 128,
"encoder_rates": [
2,
5,
8,
8
],
"latent_dim": 64,
"decoder_dim": 2048,
"decoder_rates": [
8,
6,
5,
2,
2,
2
],
"sr_bin_boundaries": [
20000,
30000,
40000
],
"sample_rate": 16000,
"out_sample_rate": 48000
},
"max_length": 8192,
"device": "cuda",
"dtype": "bfloat16",
"model": "openbmb/VoxCPM2",
"format": "litert",
"variant": "int8",
"sample_rate": 48000,
"audio_conditioning_sample_rate": 16000,
"text_tokenizer": "tokenizer.json",
"max_text_tokens": 256,
"max_generated_tokens": 2048,
"default_inference_timesteps": 10,
"default_cfg_value": 2.0,
"files": {
"text_prefill": "voxcpm2-text-prefill.tflite",
"token_step": "voxcpm2-token-step.tflite",
"audio_encoder": "voxcpm2-audio-encoder.tflite",
"audio_decoder": "voxcpm2-audio-decoder.tflite"
},
"export": {
"source_repo": "openbmb/VoxCPM2",
"voxcpm_src_required": true,
"graph_variants": {
"text_prefill": "int8",
"token_step": "int8",
"audio_encoder": "fp32",
"audio_decoder": "fp32"
},
"token_step_cache_contract": "[2, layers, batch, kv_heads, max_cache_length, head_dim]",
"token_step_status": "experimental until litert_torch cache lowering is verified"
},
"inputs": {
"text_prefill": {
"text_tokens": {
"dtype": "int64",
"rank": 2,
"shape": [
1,
256
],
"name": "serving_default_args_0"
},
"text_mask": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
256
],
"name": "serving_default_args_1"
},
"audio_feats": {
"dtype": "float32",
"rank": 4,
"shape": [
1,
256,
4,
64
],
"name": "serving_default_args_2"
},
"audio_mask": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
256
],
"name": "serving_default_args_3"
},
"context_length": {
"dtype": "int64",
"rank": 0,
"shape": [],
"name": "serving_default_args_4"
}
},
"token_step": {
"lm_hidden": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
2048
],
"name": "serving_default_args_0"
},
"residual_hidden": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
2048
],
"name": "serving_default_args_1"
},
"prefix_feat_cond": {
"dtype": "float32",
"rank": 3,
"shape": [
1,
4,
64
],
"name": "serving_default_args_2"
},
"base_cache": {
"dtype": "float32",
"rank": 6,
"shape": [
2,
28,
1,
2,
2304,
128
],
"name": "serving_default_args_3"
},
"residual_cache": {
"dtype": "float32",
"rank": 6,
"shape": [
2,
8,
1,
2,
2304,
128
],
"name": "serving_default_args_4"
},
"position_id": {
"dtype": "int64",
"rank": 0,
"shape": [],
"name": "serving_default_args_5"
},
"noise": {
"dtype": "float32",
"rank": 3,
"shape": [
1,
64,
4
],
"name": "serving_default_args_6"
}
},
"audio_encoder": {
"audio": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
102400
],
"name": "serving_default_args_0"
}
},
"audio_decoder": {
"latent": {
"dtype": "float32",
"rank": 3,
"shape": [
1,
64,
256
],
"name": "serving_default_args_0"
}
}
},
"outputs": {
"text_prefill": {
"lm_hidden": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
2048
],
"name": "serving_default_output_0_output"
},
"residual_hidden": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
2048
],
"name": "serving_default_output_1_output"
},
"prefix_feat_cond": {
"dtype": "float32",
"rank": 3,
"shape": [
1,
4,
64
],
"name": "serving_default_output_2_output"
},
"base_cache": {
"dtype": "float32",
"rank": 6,
"shape": [
2,
28,
1,
2,
256,
128
],
"name": "serving_default_output_3_output"
},
"residual_cache": {
"dtype": "float32",
"rank": 6,
"shape": [
2,
8,
1,
2,
256,
128
],
"name": "serving_default_output_4_output"
}
},
"token_step": {
"pred_feat": {
"dtype": "float32",
"rank": 3,
"shape": [
1,
4,
64
],
"name": "serving_default_output_0_output"
},
"stop_logits": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
2
],
"name": "serving_default_output_1_output"
},
"next_lm_hidden": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
2048
],
"name": "serving_default_output_2_output"
},
"next_residual_hidden": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
2048
],
"name": "serving_default_output_3_output"
},
"base_cache": {
"dtype": "float32",
"rank": 6,
"shape": [
2,
28,
1,
2,
2304,
128
],
"name": "serving_default_output_4_output"
},
"residual_cache": {
"dtype": "float32",
"rank": 6,
"shape": [
2,
8,
1,
2,
2304,
128
],
"name": "serving_default_output_5_output"
}
},
"audio_encoder": {
"audio_feats": {
"dtype": "float32",
"rank": 4,
"shape": [
1,
40,
4,
64
],
"name": "serving_default_output_0_output"
}
},
"audio_decoder": {
"pcm": {
"dtype": "float32",
"rank": 2,
"shape": [
1,
491520
],
"name": "serving_default_output_0_output"
}
}
}
}