{ "export_timestamp": "[0.]", "models": [ { "model_type": "spectrostream_encoder", "input_name": "audio", "input_shape": [ 1, 96000, 2 ], "output_name": "embeddings", "output_shape": [ 1, 50, 256 ], "sample_rate": 48000, "frame_rate": 25.0, "embedding_dim": 256, "opset_version": 18, "ir_version": 8, "precision": "fp16" }, { "model_type": "spectrostream_decoder", "input_name": "tokens", "input_shape": [ 1, 50, 64 ], "output_name": "audio", "output_shape": [ 1, 96000, 2 ], "sample_rate": 48000, "frame_rate": 25.0, "rvq_depth": 64, "rvq_codebook_size": 1024, "opset_version": 18, "ir_version": 8, "precision": "fp16" }, { "model_type": "musiccoca_text_encoder", "inputs": [ { "name": "text_ids", "shape": [ 1, 128 ], "dtype": "int32" }, { "name": "padding", "shape": [ 1, 128 ], "dtype": "float32" } ], "output_name": "embedding", "output_shape": [ 1, 768 ], "max_text_length": 128, "embedding_dim": 768, "opset_version": 18, "ir_version": 8, "precision": "fp16" }, { "model_type": "musiccoca_audio_encoder", "input_name": "audio", "input_shape": [ 1, 160000 ], "output_name": "embedding", "output_shape": [ 1, 768 ], "sample_rate": 16000, "clip_length": 10.0, "embedding_dim": 768, "opset_version": 18, "ir_version": 8, "precision": "fp16" }, { "model_type": "musiccoca_rvq_quantizer", "input_name": "embedding", "input_shape": [ 1, 768 ], "output_name": "tokens", "output_shape": [ 1, 12 ], "embedding_dim": 768, "rvq_depth": 12, "rvq_codebook_size": 1024, "opset_version": 18, "ir_version": 8, "precision": "fp16" }, { "model_type": "depthformer_base_encoder", "inputs": [ { "name": "context_tokens", "shape": [ 1, 1006 ], "dtype": "int32" }, { "name": "style_tokens", "shape": [ 1, 6 ], "dtype": "int32" } ], "output_name": "encoder_hidden_states", "output_shape": [ 1, 1006, 768 ], "model_config": { "embed_dim": 768, "num_heads": 12, "num_encoder_layers": 12, "mlp_dim": 2048 }, "token_config": { "context_length": 1006, "style_rvq_depth": 6, "rvq_codebook_size": 1024 }, "opset_version": 18, "ir_version": 8, "precision": "fp16" }, { "model_type": "depthformer_base_decoder_step", "inputs": [ { "name": "target_token", "shape": [ 1, 1 ], "dtype": "int32" }, { "name": "encoder_hidden_states", "shape": [ 1, 1006, 768 ], "dtype": "float32" }, { "name": "kv_cache_keys", "shape": "dynamic", "dtype": "float32" }, { "name": "kv_cache_values", "shape": "dynamic", "dtype": "float32" } ], "outputs": [ { "name": "logits", "shape": [ 1, 16384 ], "dtype": "float32" }, { "name": "new_kv_cache_keys", "shape": "dynamic", "dtype": "float32" }, { "name": "new_kv_cache_values", "shape": "dynamic", "dtype": "float32" } ], "model_config": { "embed_dim": 768, "num_heads": 12, "num_decoder_layers": 12, "mlp_dim": 2048 }, "kv_cache": { "max_length": 1806, "num_heads": 12, "head_dim": 64 }, "opset_version": 18, "ir_version": 8, "precision": "fp16" } ], "config": { "opset_version": 18, "ir_version": 8, "precision": "fp16" } }