{
  "model_name": "F5TTS_v1_Base",
  "model_type": "flow_matching_dit",
  "base_model": "SWivid/F5-TTS",
  "parameters": 337000000,
  "architecture": {
    "type": "Conditional Flow Matching (CFM) with Diffusion Transformer (DiT)",
    "backbone": "DiT",
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
    "text_num_embeds": 2545,
    "mel_dim": 100
  },
  "mel_spec": {
    "n_fft": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 100,
    "target_sample_rate": 24000,
    "mel_spec_type": "vocos"
  },
  "training": {
    "pretrained_checkpoint": "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
    "pretrained_steps": 1250000,
    "dataset": "amphion/Emilia-Dataset",
    "dataset_size": "~95000 hours",
    "optimizer": "AdamW",
    "learning_rate": 7.5e-5,
    "warmup_steps": 20000,
    "total_steps": 1250000,
    "batch_size_frames": 307200,
    "precision": "bf16",
    "hardware": "8x NVIDIA A100 80GB",
    "grad_clip": 1.0,
    "masking_range": [0.7, 1.0],
    "audio_drop_prob": 0.3,
    "cond_drop_prob": 0.2
  },
  "fine_tune_recommendation": {
    "learning_rate": 1e-5,
    "epochs": 10,
    "batch_size_per_gpu_frames": 19200,
    "grad_accumulation_steps": 2,
    "num_warmup_updates": 500,
    "dataset": "mythicinfinity/libritts_r",
    "min_reference_audio": "3 seconds",
    "max_reference_audio": "10 seconds"
  },
  "inference": {
    "default_nfe_step": 32,
    "cfg_strength": 1.0,
    "sway_sampling_coef": -1.0,
    "speed": 1.0,
    "real_time_factor": 0.15,
    "max_duration": 4096
  },
  "languages": ["en", "zh"],
  "task": "text-to-speech",
  "tags": ["f5-tts", "voice-cloning", "zero-shot-tts", "flow-matching", "dit"]
}
|