inclusionAI
/

MingTok-Audio

Model card Files Files and versions

MingTok-Audio / config.json

yongjielv's picture

Upload folder using huggingface_hub

a1075ac verified 7 months ago

history blame contribute delete

3.19 kB

	{
	"architectures": [
	"AudioVAE"
	],
	"dec_kwargs": {
	"backbone": {
	"_attn_implementation": "flash_attention_2",
	"attention_dropout": 0.0,
	"attn_implementation": null,
	"bos_token_id": 151643,
	"eos_token_id": 151645,
	"hidden_act": "silu",
	"hidden_size": 896,
	"initializer_range": 0.02,
	"intermediate_size": 4864,
	"is_causal": true,
	"max_position_embeddings": 32768,
	"max_window_layers": 0,
	"model_type": "qwen2",
	"num_attention_heads": 14,
	"num_hidden_layers": 24,
	"num_key_value_heads": 2,
	"rms_norm_eps": 1e-06,
	"rope_theta": 1000000.0,
	"sliding_window": 32,
	"tie_word_embeddings": true,
	"torch_dtype": "bfloat16",
	"transformers_version": "4.43.1",
	"use_cache": false,
	"use_sliding_window": true,
	"vocab_size": 1
	},
	"latent_dim": 64,
	"output_dim": 320
	},
	"enc_kwargs": {
	"backbone": {
	"_attn_implementation": "flash_attention_2",
	"attention_dropout": 0.0,
	"attn_implementation": null,
	"bos_token_id": 151643,
	"eos_token_id": 151645,
	"hidden_act": "silu",
	"hidden_size": 896,
	"initializer_range": 0.02,
	"intermediate_size": 4864,
	"is_causal": true,
	"max_position_embeddings": 32768,
	"max_window_layers": 0,
	"model_type": "qwen2",
	"num_attention_heads": 14,
	"num_hidden_layers": 24,
	"num_key_value_heads": 2,
	"rms_norm_eps": 1e-06,
	"rope_theta": 1000000.0,
	"sliding_window": 32,
	"tie_word_embeddings": true,
	"torch_dtype": "bfloat16",
	"transformers_version": "4.43.1",
	"use_cache": false,
	"use_sliding_window": true,
	"vocab_size": 1
	},
	"hop_size": 320,
	"input_dim": 320,
	"latent_dim": 64
	},
	"hifi_gan_disc_kwargs": {
	"channel_increasing_factor": 4,
	"channels": 16,
	"max_downsample_channels": 512,
	"periods": [
	2,
	3,
	5,
	7,
	11
	]
	},
	"init_method": "kaiming",
	"lambda_adv": 1.0,
	"lambda_disc": 1.0,
	"lambda_feat_match_loss": 1.0,
	"lambda_mel_loss": 1.0,
	"lambda_semantic": 2.0,
	"patch_size": -1,
	"semantic_module_kwargs": {
	"causal": true,
	"whisper_encoder": {
	"n_ctx": 1500,
	"n_head": 20,
	"n_layer": 32,
	"n_mels": 128,
	"n_state": 1280
	}
	},
	"spec_disc_kwargs": {
	"channels": 32,
	"downsample_scales": [
	2,
	2,
	2
	],
	"in_channels": 1,
	"kernel_sizes": [
	5,
	3
	],
	"max_downsample_channels": 512,
	"out_channels": 1,
	"stft_params": {
	"fft_sizes": [
	78,
	126,
	206,
	334,
	542,
	876,
	1418,
	2296
	],
	"hop_sizes": [
	39,
	63,
	103,
	167,
	271,
	438,
	709,
	1148
	],
	"win_lengths": [
	78,
	126,
	206,
	334,
	542,
	876,
	1418,
	2296
	],
	"window": "hann_window"
	},
	"use_weight_norm": true
	},
	"torch_dtype": "bfloat16",
	"transformers_version": "4.52.4"
	}