Text-to-Speech
F5-TTS
English
Chinese
flow_matching_dit
voice-cloning
flow-matching
zero-shot-tts
voice-clone-f5tts / config.json
rajkr's picture
Upload config.json
a5efda4 verified
{
"model_name": "F5TTS_v1_Base",
"model_type": "flow_matching_dit",
"base_model": "SWivid/F5-TTS",
"parameters": 337000000,
"architecture": {
"type": "Conditional Flow Matching (CFM) with Diffusion Transformer (DiT)",
"backbone": "DiT",
"dim": 1024,
"depth": 22,
"heads": 16,
"ff_mult": 2,
"text_dim": 512,
"conv_layers": 4,
"text_num_embeds": 2545,
"mel_dim": 100
},
"mel_spec": {
"n_fft": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mel_channels": 100,
"target_sample_rate": 24000,
"mel_spec_type": "vocos"
},
"training": {
"pretrained_checkpoint": "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
"pretrained_steps": 1250000,
"dataset": "amphion/Emilia-Dataset",
"dataset_size": "~95000 hours",
"optimizer": "AdamW",
"learning_rate": 7.5e-5,
"warmup_steps": 20000,
"total_steps": 1250000,
"batch_size_frames": 307200,
"precision": "bf16",
"hardware": "8x NVIDIA A100 80GB",
"grad_clip": 1.0,
"masking_range": [0.7, 1.0],
"audio_drop_prob": 0.3,
"cond_drop_prob": 0.2
},
"fine_tune_recommendation": {
"learning_rate": 1e-5,
"epochs": 10,
"batch_size_per_gpu_frames": 19200,
"grad_accumulation_steps": 2,
"num_warmup_updates": 500,
"dataset": "mythicinfinity/libritts_r",
"min_reference_audio": "3 seconds",
"max_reference_audio": "10 seconds"
},
"inference": {
"default_nfe_step": 32,
"cfg_strength": 2.0,
"sway_sampling_coef": -1.0,
"speed": 1.0,
"real_time_factor": 0.15,
"max_duration": 4096
},
"languages": ["en", "zh"],
"task": "text-to-speech",
"tags": ["f5-tts", "voice-cloning", "zero-shot-tts", "flow-matching", "dit"]
}