Text-to-Speech
F5-TTS
English
Chinese
flow_matching_dit
voice-cloning
flow-matching
zero-shot-tts
rajkr committed on
Commit
a5efda4
·
verified ·
1 Parent(s): cf199d1

Upload config.json

Browse files
Files changed (1) hide show
  1. config.json +64 -0
config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "F5TTS_v1_Base",
3
+ "model_type": "flow_matching_dit",
4
+ "base_model": "SWivid/F5-TTS",
5
+ "parameters": 337000000,
6
+ "architecture": {
7
+ "type": "Conditional Flow Matching (CFM) with Diffusion Transformer (DiT)",
8
+ "backbone": "DiT",
9
+ "dim": 1024,
10
+ "depth": 22,
11
+ "heads": 16,
12
+ "ff_mult": 2,
13
+ "text_dim": 512,
14
+ "conv_layers": 4,
15
+ "text_num_embeds": 2545,
16
+ "mel_dim": 100
17
+ },
18
+ "mel_spec": {
19
+ "n_fft": 1024,
20
+ "hop_length": 256,
21
+ "win_length": 1024,
22
+ "n_mel_channels": 100,
23
+ "target_sample_rate": 24000,
24
+ "mel_spec_type": "vocos"
25
+ },
26
+ "training": {
27
+ "pretrained_checkpoint": "hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors",
28
+ "pretrained_steps": 1250000,
29
+ "dataset": "amphion/Emilia-Dataset",
30
+ "dataset_size": "~95000 hours",
31
+ "optimizer": "AdamW",
32
+ "learning_rate": 7.5e-5,
33
+ "warmup_steps": 20000,
34
+ "total_steps": 1250000,
35
+ "batch_size_frames": 307200,
36
+ "precision": "bf16",
37
+ "hardware": "8x NVIDIA A100 80GB",
38
+ "grad_clip": 1.0,
39
+ "masking_range": [0.7, 1.0],
40
+ "audio_drop_prob": 0.3,
41
+ "cond_drop_prob": 0.2
42
+ },
43
+ "fine_tune_recommendation": {
44
+ "learning_rate": 1e-5,
45
+ "epochs": 10,
46
+ "batch_size_per_gpu_frames": 19200,
47
+ "grad_accumulation_steps": 2,
48
+ "num_warmup_updates": 500,
49
+ "dataset": "mythicinfinity/libritts_r",
50
+ "min_reference_audio": "3 seconds",
51
+ "max_reference_audio": "10 seconds"
52
+ },
53
+ "inference": {
54
+ "default_nfe_step": 32,
55
+ "cfg_strength": 1.0,
56
+ "sway_sampling_coef": -1.0,
57
+ "speed": 1.0,
58
+ "real_time_factor": 0.15,
59
+ "max_duration": 4096
60
+ },
61
+ "languages": ["en", "zh"],
62
+ "task": "text-to-speech",
63
+ "tags": ["f5-tts", "voice-cloning", "zero-shot-tts", "flow-matching", "dit"]
64
+ }