rajkr
/

voice-clone-f5tts

flow_matching_dit

Model card Files Files and versions

voice-clone-f5tts / config.json

rajkr's picture

Upload config.json

a5efda4 verified 13 days ago

history blame contribute delete

1.76 kB

	{
	"model_name": "F5TTS_v1_Base",
	"model_type": "flow_matching_dit",
	"base_model": "SWivid/F5-TTS",
	"parameters": 337000000,
	"architecture": {
	"type": "Conditional Flow Matching (CFM) with Diffusion Transformer (DiT)",
	"backbone": "DiT",
	"dim": 1024,
	"depth": 22,
	"heads": 16,
	"ff_mult": 2,
	"text_dim": 512,
	"conv_layers": 4,
	"text_num_embeds": 2545,
	"mel_dim": 100
	},
	"mel_spec": {
	"n_fft": 1024,
	"hop_length": 256,
	"win_length": 1024,
	"n_mel_channels": 100,
	"target_sample_rate": 24000,
	"mel_spec_type": "vocos"
	},
	"training": {
	"pretrained_checkpoint": "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
	"pretrained_steps": 1250000,
	"dataset": "amphion/Emilia-Dataset",
	"dataset_size": "~95000 hours",
	"optimizer": "AdamW",
	"learning_rate": 7.5e-5,
	"warmup_steps": 20000,
	"total_steps": 1250000,
	"batch_size_frames": 307200,
	"precision": "bf16",
	"hardware": "8x NVIDIA A100 80GB",
	"grad_clip": 1.0,
	"masking_range": [0.7, 1.0],
	"audio_drop_prob": 0.3,
	"cond_drop_prob": 0.2
	},
	"fine_tune_recommendation": {
	"learning_rate": 1e-5,
	"epochs": 10,
	"batch_size_per_gpu_frames": 19200,
	"grad_accumulation_steps": 2,
	"num_warmup_updates": 500,
	"dataset": "mythicinfinity/libritts_r",
	"min_reference_audio": "3 seconds",
	"max_reference_audio": "10 seconds"
	},
	"inference": {
	"default_nfe_step": 32,
	"cfg_strength": 1.0,
	"sway_sampling_coef": -1.0,
	"speed": 1.0,
	"real_time_factor": 0.15,
	"max_duration": 4096
	},
	"languages": ["en", "zh"],
	"task": "text-to-speech",
	"tags": ["f5-tts", "voice-cloning", "zero-shot-tts", "flow-matching", "dit"]
	}