LH-Tech-AI commited on
Commit
71bb9eb
·
verified ·
1 Parent(s): a4d2fd0

Delete DELETE____setup.txt

Browse files
Files changed (1) hide show
  1. DELETE____setup.txt +0 -171
DELETE____setup.txt DELETED
@@ -1,171 +0,0 @@
1
- INSTALLS AND PREPARATION:
2
-
3
- pip install git+https://github.com/idiap/coqui-tts.git
4
- sudo apt update && sudo apt install espeak -y
5
- sudo apt install ffmpeg libavcodec-dev libavformat-dev libavutil-dev -y
6
- pip install "coqui-tts[codec]"
7
- wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
8
- tar -xjf LJSpeech-1.1.tar.bz2
9
- wget https://huggingface.co/LH-Tech-AI/Flare-TTS-28M/resolve/main/model.pth
10
- wget https://huggingface.co/LH-Tech-AI/Flare-TTS-28M/resolve/main/config.json
11
-
12
- nano train_vocoder.py
13
-
14
- Paste the following into train_vocoder.py:
15
- ####################################################################################################################
16
-
17
- # train_vocoder.py
18
- # HiFi-GAN v1 from scratch auf LJSpeech, Audio-Config passend zu GlowTTS-Modell
19
- import os
20
- from trainer import Trainer, TrainerArgs
21
- from TTS.utils.audio import AudioProcessor
22
- from TTS.config.shared_configs import BaseAudioConfig
23
- from TTS.vocoder.configs import HifiganConfig
24
- from TTS.vocoder.datasets.preprocess import load_wav_data
25
- from TTS.vocoder.models.gan import GAN
26
-
27
-
28
def main():
    """Train a HiFi-GAN v1 vocoder from scratch on the LJSpeech wav files.

    The audio settings below must match the GlowTTS acoustic model's
    config.json exactly; a single deviating value makes the vocoder learn
    a mel distribution different from the one the acoustic model emits.
    """
    run_dir = os.path.dirname(os.path.abspath(__file__))
    wav_dir = os.path.join(run_dir, "LJSpeech-1.1/wavs/")

    # ===========================================================
    # AUDIO CONFIG: must match your GlowTTS config.json EXACTLY!
    # These values are the Coqui defaults GlowTTS was trained with.
    # ===========================================================
    audio_cfg = BaseAudioConfig(
        sample_rate=22050,
        resample=False,
        do_trim_silence=True,
        trim_db=45,
        # STFT
        fft_size=1024,
        win_length=1024,
        hop_length=256,
        frame_shift_ms=None,
        frame_length_ms=None,
        # Mel filterbank
        num_mels=80,
        mel_fmin=0.0,
        mel_fmax=None,  # GlowTTS config has null here
        # Normalisation & scaling — values taken from the GlowTTS config
        signal_norm=True,
        symmetric_norm=True,
        max_norm=4.0,
        clip_norm=True,
        ref_level_db=20,
        min_level_db=-100,
        spec_gain=20.0,  # GlowTTS value
        log_func="np.log10",  # GlowTTS value
        preemphasis=0.0,
        # No precomputed feature-stats file
        stats_path=None,
    )

    # ===========================================================
    # HiFi-GAN v1 config (standard variant, best quality)
    # ===========================================================
    vocoder_cfg = HifiganConfig(
        run_name="hifigan_ljspeech",
        run_description="HiFi-GAN v1 from scratch, GlowTTS-compatible mels",
        # Data
        data_path=wav_dir,
        output_path=run_dir,
        eval_split_size=10,
        # Audio
        audio=audio_cfg,
        # Training schedule
        epochs=2000,  # training is stopped by step count in practice
        batch_size=32,  # safe start on an A6000; try 48/64 if memory allows
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=2,
        run_eval=True,
        test_delay_epochs=5,
        mixed_precision=True,
        # Vocoder-specific
        seq_len=8192,  # waveform snippet length per training sample
        pad_short=2000,
        use_noise_augment=True,
        # Optimizer (HiFi-GAN paper defaults)
        lr_gen=2e-4,
        lr_disc=2e-4,
        # Logging / checkpointing
        print_step=50,
        print_eval=False,
        save_step=5000,
        save_n_checkpoints=5,
        save_checkpoints=True,
        log_model_step=10000,
        plot_step=500,
    )

    # The AudioProcessor derives all its settings from vocoder_cfg.audio.
    audio_processor = AudioProcessor(config=vocoder_cfg.audio)

    # Split the wav files into eval and train partitions.
    eval_samples, train_samples = load_wav_data(
        vocoder_cfg.data_path,
        vocoder_cfg.eval_split_size,
    )

    gan = GAN(vocoder_cfg)

    # No restore_path -> training starts from scratch.
    trainer = Trainer(
        TrainerArgs(),
        vocoder_cfg,
        run_dir,
        model=gan,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": audio_processor},
    )

    trainer.fit()


if __name__ == "__main__":
    main()
141
-
142
- ####################################################################################################################
143
-
144
- ⚠️ Most important step: verify the audio parameters
- Open your old GlowTTS config.json and compare it with the audio_config above.
- If any value differs on your side (e.g. trim_db=60 instead of 45, or a different ref_level_db) → adjust it here.
  A single deviating value = the vocoder learns the wrong mel distribution = garbage sound again.
- Quick check: python3 -c "import json; c=json.load(open('config.json')); print(json.dumps(c['audio'], indent=2))"
  --> Compare the output against the audio_config above.
150
-
151
- STARTEN:
152
-
153
- screen -S vocoder
154
- PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python3 train_vocoder.py
155
-
156
- INFERENCE:
157
-
158
- tts --text "Today, LH-Tech AI is introducing a new type of model: text to speech. You are currently listening to audio that was completely generated by Flare-TTS." \
159
- --model_path ./model.pth \
160
- --config_path ./config.json \
161
- --vocoder_path ./run-XXXXX/best_model.pth \
162
- --vocoder_config_path ./run-XXXXX/config.json \
163
- --out_path output_flare.wav
164
-
165
-
166
- ERWARTUNG:
167
-
168
- - 30 min (~10k Steps): Sprache erkennbar, noch rauschig
169
- - 2h (~50k Steps): brauchbar, deutlich besser als Griffin-Lim
170
- - 4–6h (~100–150k Steps): gute Qualität ✅
171
- - 10h+: production-grade