{
  "model_name": "F5TTS_v1_Base",
  "model_type": "flow_matching_dit",
  "base_model": "SWivid/F5-TTS",
  "parameters": 337000000,
  "architecture": {
    "type": "Conditional Flow Matching (CFM) with Diffusion Transformer (DiT)",
    "backbone": "DiT",
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
    "text_num_embeds": 2545,
    "mel_dim": 100
  },
  "mel_spec": {
    "n_fft": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 100,
    "target_sample_rate": 24000,
    "mel_spec_type": "vocos"
  },
  "training": {
    "pretrained_checkpoint": "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
    "pretrained_steps": 1250000,
    "dataset": "amphion/Emilia-Dataset",
    "dataset_size": "~95000 hours",
    "optimizer": "AdamW",
    "learning_rate": 7.5e-5,
    "warmup_steps": 20000,
    "total_steps": 1250000,
    "batch_size_frames": 307200,
    "precision": "bf16",
    "hardware": "8x NVIDIA A100 80GB",
    "grad_clip": 1.0,
    "masking_range": [0.7, 1.0],
    "audio_drop_prob": 0.3,
    "cond_drop_prob": 0.2
  },
  "fine_tune_recommendation": {
    "learning_rate": 1e-5,
    "epochs": 10,
    "batch_size_per_gpu_frames": 19200,
    "grad_accumulation_steps": 2,
    "num_warmup_updates": 500,
    "dataset": "mythicinfinity/libritts_r",
    "min_reference_audio": "3 seconds",
    "max_reference_audio": "10 seconds"
  },
  "inference": {
    "default_nfe_step": 32,
    "cfg_strength": 1.0,
    "sway_sampling_coef": -1.0,
    "speed": 1.0,
    "real_time_factor": 0.15,
    "max_duration": 4096
  },
  "languages": ["en", "zh"],
  "task": "text-to-speech",
  "tags": ["f5-tts", "voice-cloning", "zero-shot-tts", "flow-matching", "dit"]
}