LH-Tech-AI commited on
Commit
1f436a9
·
verified ·
1 Parent(s): 6104320

Create DELETE____setup.txt

Browse files
Files changed (1) hide show
  1. DELETE____setup.txt +171 -0
DELETE____setup.txt ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
INSTALLS AND PREPARATION:

pip install git+https://github.com/idiap/coqui-tts.git
sudo apt update && sudo apt install espeak -y
sudo apt install ffmpeg libavcodec-dev libavformat-dev libavutil-dev -y
pip install "coqui-tts[codec]"
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar -xjf LJSpeech-1.1.tar.bz2
wget https://huggingface.co/LH-Tech-AI/Flare-TTS-28M/resolve/main/model.pth
wget https://huggingface.co/LH-Tech-AI/Flare-TTS-28M/resolve/main/config.json

nano train_vocoder.py

Paste the following into the file:
####################################################################################################################
# train_vocoder.py
# Train a HiFi-GAN v1 vocoder from scratch on LJSpeech, with audio settings
# matching the GlowTTS acoustic model's config.json.
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseAudioConfig
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN


def _make_audio_config() -> BaseAudioConfig:
    """Audio/STFT/mel settings.

    These MUST match the GlowTTS config.json exactly — they are the Coqui
    defaults the GlowTTS model was trained with. A single mismatched value
    makes the vocoder learn the wrong mel distribution.
    """
    return BaseAudioConfig(
        sample_rate=22050,
        resample=False,
        do_trim_silence=True,
        trim_db=45,
        # STFT
        fft_size=1024,
        win_length=1024,
        hop_length=256,
        frame_shift_ms=None,
        frame_length_ms=None,
        # Mel filterbank
        num_mels=80,
        mel_fmin=0.0,
        mel_fmax=None,  # GlowTTS config value (null)
        # Normalization & scaling — taken from the GlowTTS config
        signal_norm=True,
        symmetric_norm=True,
        max_norm=4.0,
        clip_norm=True,
        ref_level_db=20,
        min_level_db=-100,
        spec_gain=20.0,  # GlowTTS config value
        log_func="np.log10",  # GlowTTS config value
        preemphasis=0.0,
        # No precomputed normalization stats
        stats_path=None,
    )


def main() -> None:
    """Assemble the HiFi-GAN v1 config, load LJSpeech wavs and run training."""
    run_dir = os.path.dirname(os.path.abspath(__file__))
    wavs_dir = os.path.join(run_dir, "LJSpeech-1.1/wavs/")

    # ===========================================================
    # HiFi-GAN v1 config (standard variant, best quality)
    # ===========================================================
    config = HifiganConfig(
        run_name="hifigan_ljspeech",
        run_description="HiFi-GAN v1 from scratch, GlowTTS-compatible mels",
        # Data
        data_path=wavs_dir,
        output_path=run_dir,
        eval_split_size=10,
        # Audio — must agree with the GlowTTS config.json
        audio=_make_audio_config(),
        # Training schedule
        epochs=2000,  # run is stopped by step count in practice
        batch_size=32,  # safe start on an A6000; try 48/64 if memory allows
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=2,
        run_eval=True,
        test_delay_epochs=5,
        mixed_precision=True,
        # Vocoder-specific
        seq_len=8192,  # waveform snippet length per sample
        pad_short=2000,
        use_noise_augment=True,
        # Optimizer (HiFi-GAN paper defaults)
        lr_gen=2e-4,
        lr_disc=2e-4,
        # Logging / checkpointing
        print_step=50,
        print_eval=False,
        save_step=5000,
        save_n_checkpoints=5,
        save_checkpoints=True,
        log_model_step=10000,
        plot_step=500,
    )

    # AudioProcessor derives all its settings from config.audio.
    audio_processor = AudioProcessor(config=config.audio)

    # load_wav_data returns the (eval, train) wav splits.
    eval_samples, train_samples = load_wav_data(
        config.data_path,
        config.eval_split_size,
    )

    gan = GAN(config)

    # No restore_path → training starts from scratch.
    trainer = Trainer(
        TrainerArgs(),
        config,
        run_dir,
        model=gan,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": audio_processor},
    )
    trainer.fit()


if __name__ == "__main__":
    main()
####################################################################################################################

⚠️ Most important step: verify the audio parameters
- Open your existing GlowTTS config.json and compare it against the audio_config above.
- If any value differs on your side (e.g. trim_db=60, or a different ref_level_db), adjust it here.
  A single mismatched value = the vocoder learns the wrong mel distribution = garbage sound again.
- Quick check: python3 -c "import json; c=json.load(open('config.json')); print(json.dumps(c['audio'], indent=2))"
  --> Compare the output against the audio_config above.

START TRAINING:

screen -S vocoder
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python3 train_vocoder.py
155
+
INFERENCE:

tts --text "Today, LH-Tech AI is introducing a new type of model: text to speech. You are currently listening to audio that was completely generated by Flare-TTS." \
    --model_path ./model.pth \
    --config_path ./config.json \
    --vocoder_path ./run-XXXXX/best_model.pth \
    --vocoder_config_path ./run-XXXXX/config.json \
    --out_path output_flare.wav

EXPECTED PROGRESS:

- 30 min (~10k steps): speech recognizable, still noisy
- 2 h (~50k steps): usable, clearly better than Griffin-Lim
- 4–6 h (~100–150k steps): good quality ✅
- 10 h+: production-grade