{
  "model_name": "F5TTS_v1_Base",
  "model_type": "flow_matching_dit",
  "base_model": "SWivid/F5-TTS",
  "parameters": 337000000,
  "architecture": {
    "type": "Conditional Flow Matching (CFM) with Diffusion Transformer (DiT)",
    "backbone": "DiT",
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
    "text_num_embeds": 2545,
    "mel_dim": 100
  },
  "mel_spec": {
    "n_fft": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 100,
    "target_sample_rate": 24000,
    "mel_spec_type": "vocos"
  },
  "training": {
    "pretrained_checkpoint": "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
    "pretrained_steps": 1250000,
    "dataset": "amphion/Emilia-Dataset",
    "dataset_size": "~95000 hours",
    "optimizer": "AdamW",
    "learning_rate": 7.5e-5,
    "warmup_steps": 20000,
    "total_steps": 1250000,
    "batch_size_frames": 307200,
    "precision": "bf16",
    "hardware": "8x NVIDIA A100 80GB",
    "grad_clip": 1.0,
    "masking_range": [0.7, 1.0],
    "audio_drop_prob": 0.3,
    "cond_drop_prob": 0.2
  },
  "fine_tune_recommendation": {
    "learning_rate": 1e-5,
    "epochs": 10,
    "batch_size_per_gpu_frames": 19200,
    "grad_accumulation_steps": 2,
    "num_warmup_updates": 500,
    "dataset": "mythicinfinity/libritts_r",
    "min_reference_audio": "3 seconds",
    "max_reference_audio": "10 seconds"
  },
  "inference": {
    "default_nfe_step": 32,
    "cfg_strength": 1.0,
    "sway_sampling_coef": -1.0,
    "speed": 1.0,
    "real_time_factor": 0.15,
    "max_duration": 4096
  },
  "languages": ["en", "zh"],
  "task": "text-to-speech",
  "tags": ["f5-tts", "voice-cloning", "zero-shot-tts", "flow-matching", "dit"]
}