rajkr
/

voice-clone-f5tts

flow_matching_dit

Model card · Files and versions

rajkr committed 15 days ago

Commit

a5efda4

·

verified ·

1 Parent(s): cf199d1

Upload config.json

Files changed (1) hide show

config.json +64 -0

config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "model_name": "F5TTS_v1_Base",
+  "model_type": "flow_matching_dit",
+  "base_model": "SWivid/F5-TTS",
+  "parameters": 337000000,
+  "architecture": {
+    "type": "Conditional Flow Matching (CFM) with Diffusion Transformer (DiT)",
+    "backbone": "DiT",
+    "dim": 1024,
+    "depth": 22,
+    "heads": 16,
+    "ff_mult": 2,
+    "text_dim": 512,
+    "conv_layers": 4,
+    "text_num_embeds": 2545,
+    "mel_dim": 100
+  },
+  "mel_spec": {
+    "n_fft": 1024,
+    "hop_length": 256,
+    "win_length": 1024,
+    "n_mel_channels": 100,
+    "target_sample_rate": 24000,
+    "mel_spec_type": "vocos"
+  },
+  "training": {
+    "pretrained_checkpoint": "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
+    "pretrained_steps": 1250000,
+    "dataset": "amphion/Emilia-Dataset",
+    "dataset_size": "~95000 hours",
+    "optimizer": "AdamW",
+    "learning_rate": 7.5e-5,
+    "warmup_steps": 20000,
+    "total_steps": 1250000,
+    "batch_size_frames": 307200,
+    "precision": "bf16",
+    "hardware": "8x NVIDIA A100 80GB",
+    "grad_clip": 1.0,
+    "masking_range": [0.7, 1.0],
+    "audio_drop_prob": 0.3,
+    "cond_drop_prob": 0.2
+  },
+  "fine_tune_recommendation": {
+    "learning_rate": 1e-5,
+    "epochs": 10,
+    "batch_size_per_gpu_frames": 19200,
+    "grad_accumulation_steps": 2,
+    "num_warmup_updates": 500,
+    "dataset": "mythicinfinity/libritts_r",
+    "min_reference_audio": "3 seconds",
+    "max_reference_audio": "10 seconds"
+  },
+  "inference": {
+    "default_nfe_step": 32,
+    "cfg_strength": 1.0,
+    "sway_sampling_coef": -1.0,
+    "speed": 1.0,
+    "real_time_factor": 0.15,
+    "max_duration": 4096
+  },
+  "languages": ["en", "zh"],
+  "task": "text-to-speech",
+  "tags": ["f5-tts", "voice-cloning", "zero-shot-tts", "flow-matching", "dit"]
+}