PatnaikAshish committed on
Commit
8da48d8
·
verified ·
1 Parent(s): 864ce87

Update core/cloner.py

Browse files
Files changed (1) hide show
  1. core/cloner.py +146 -11
core/cloner.py CHANGED
@@ -1,12 +1,21 @@
 
 
1
  import os
 
2
  import tempfile
 
 
 
 
3
  import torch
4
  import soundfile as sf
5
  from huggingface_hub import hf_hub_download
6
  from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
7
  from kokoro_onnx import Kokoro
 
8
  from misaki import espeak
9
  from misaki.espeak import EspeakG2P
 
10
 
11
  class KokoClone:
12
  def __init__(self, kanade_model="frothywater/kanade-12.5hz", hf_repo="PatnaikAshish/kokoclone"):
@@ -25,6 +34,71 @@ class KokoClone:
25
  # Cache for Kokoro
26
  self.kokoro_cache = {}
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def _ensure_file(self, folder, filename):
29
  """Auto-downloads missing models from your Hugging Face repo."""
30
  filepath = os.path.join(folder, filename)
@@ -39,12 +113,24 @@ class KokoClone:
39
  )
40
  return filepath
41
 
 
 
 
 
 
 
 
 
 
 
 
42
  def _get_config(self, lang):
43
  """Routes the correct model, voice, and G2P based on language."""
44
  model_file = self._ensure_file("model", "kokoro.onnx")
45
  voices_file = self._ensure_file("voice", "voices-v1.0.bin")
46
  vocab = None
47
  g2p = None
 
48
 
49
  # Optimized routing: Only load the specific G2P engine requested
50
  if lang == "en":
@@ -72,30 +158,55 @@ class KokoClone:
72
  # FIX: Auto-download the Japanese dictionary if it's missing!
73
  if not os.path.exists(unidic.DICDIR):
74
  print("Downloading missing Japanese dictionary (this takes a minute but only happens once)...")
75
- subprocess.run(["python", "-m", "unidic", "download"], check=True)
76
 
77
  g2p = ja.JAG2P()
78
  voice = "jf_alpha"
79
- vocab = self._ensure_file("model", "config.json")
 
 
80
  elif lang == "zh":
81
  from misaki import zh
82
- g2p = zh.ZHG2P(version="1.1")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  voice = "zf_001"
84
  model_file = self._ensure_file("model", "kokoro-v1.1-zh.onnx")
85
  voices_file = self._ensure_file("voice", "voices-v1.1-zh.bin")
86
- vocab = self._ensure_file("model", "config.json")
87
  else:
88
  raise ValueError(f"Language '{lang}' not supported.")
89
 
90
- return model_file, voices_file, vocab, g2p, voice
91
 
92
  def generate(self, text, lang, reference_audio, output_path="output.wav"):
93
  """Generates the speech and applies the target voice."""
94
- model_file, voices_file, vocab, g2p, voice = self._get_config(lang)
95
 
96
  # 1. Kokoro TTS Phase
97
  if model_file not in self.kokoro_cache:
98
- self.kokoro_cache[model_file] = Kokoro(model_file, voices_file, vocab_config=vocab) if vocab else Kokoro(model_file, voices_file)
 
99
 
100
  kokoro = self.kokoro_cache[model_file]
101
 
@@ -119,12 +230,36 @@ class KokoClone:
119
  ref_wav = load_audio(reference_audio, sample_rate=self.sample_rate).to(self.device)
120
 
121
  with torch.inference_mode():
122
- converted_mel = self.kanade.voice_conversion(source_waveform=source_wav, reference_waveform=ref_wav)
123
- converted_wav = vocode(self.vocoder, converted_mel.unsqueeze(0))
 
 
 
 
 
124
 
125
- sf.write(output_path, converted_wav.squeeze().cpu().numpy(), self.sample_rate)
126
  print(f"Success! Saved: {output_path}")
127
 
128
  finally:
129
  if os.path.exists(temp_path):
130
- os.remove(temp_path) # Clean up temp file silently
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.resources
2
+ import json
3
  import os
4
+ import sys
5
  import tempfile
6
+ import time
7
+ import types
8
+
9
+ import numpy as np
10
  import torch
11
  import soundfile as sf
12
  from huggingface_hub import hf_hub_download
13
  from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
14
  from kokoro_onnx import Kokoro
15
+ from kokoro_onnx.config import MAX_PHONEME_LENGTH, SAMPLE_RATE
16
  from misaki import espeak
17
  from misaki.espeak import EspeakG2P
18
+ from core.chunked_convert import chunked_voice_conversion
19
 
20
  class KokoClone:
21
  def __init__(self, kanade_model="frothywater/kanade-12.5hz", hf_repo="PatnaikAshish/kokoclone"):
 
34
  # Cache for Kokoro
35
  self.kokoro_cache = {}
36
 
37
+ def _get_vocab_config(self, lang):
38
+ """Return a vocab config path compatible with the selected language/model."""
39
+ # zh/ja model exports use the v1.1-zh vocabulary from hexgrad.
40
+ if lang in {"zh", "ja"}:
41
+ zh_vocab = os.path.join("model", "config-v1.1-zh.json")
42
+ if not os.path.exists(zh_vocab):
43
+ print("Downloading missing file 'config-v1.1-zh.json' from hexgrad/Kokoro-82M-v1.1-zh...")
44
+ hf_hub_download(
45
+ repo_id="hexgrad/Kokoro-82M-v1.1-zh",
46
+ filename="config.json",
47
+ local_dir=".",
48
+ )
49
+ downloaded = os.path.join("config.json")
50
+ if os.path.exists(downloaded):
51
+ os.replace(downloaded, zh_vocab)
52
+
53
+ if os.path.exists(zh_vocab):
54
+ return zh_vocab
55
+
56
+ local_config = os.path.join("model", "config.json")
57
+ if os.path.exists(local_config):
58
+ try:
59
+ with open(local_config, encoding="utf-8") as fp:
60
+ config = json.load(fp)
61
+ if isinstance(config, dict) and "vocab" in config:
62
+ return local_config
63
+ print("Warning: model/config.json is missing 'vocab'; using packaged kokoro_onnx config instead")
64
+ except (OSError, json.JSONDecodeError) as exc:
65
+ print(f"Warning: could not read model/config.json ({exc}); using packaged kokoro_onnx config instead")
66
+
67
+ return str(importlib.resources.files("kokoro_onnx").joinpath("config.json"))
68
+
69
+ def _patch_kokoro_compat(self, kokoro):
70
+ """Patch kokoro_onnx instances for model exports with mixed input conventions."""
71
+ input_types = {input_meta.name: input_meta.type for input_meta in kokoro.sess.get_inputs()}
72
+ if input_types.get("speed") != "tensor(float)" or "input_ids" not in input_types:
73
+ return kokoro
74
+
75
+ def _create_audio_compat(instance, phonemes, voice, speed):
76
+ if len(phonemes) > MAX_PHONEME_LENGTH:
77
+ phonemes = phonemes[:MAX_PHONEME_LENGTH]
78
+
79
+ start_t = time.time()
80
+ tokens = np.array(instance.tokenizer.tokenize(phonemes), dtype=np.int64)
81
+ assert len(tokens) <= MAX_PHONEME_LENGTH, (
82
+ f"Context length is {MAX_PHONEME_LENGTH}, but leave room for the pad token 0 at the start & end"
83
+ )
84
+
85
+ voice_style = voice[len(tokens)]
86
+ inputs = {
87
+ "input_ids": [[0, *tokens, 0]],
88
+ "style": np.array(voice_style, dtype=np.float32),
89
+ "speed": np.array([speed], dtype=np.float32),
90
+ }
91
+
92
+ audio = instance.sess.run(None, inputs)[0]
93
+ audio_duration = len(audio) / SAMPLE_RATE
94
+ create_duration = time.time() - start_t
95
+ if audio_duration > 0:
96
+ _ = create_duration / audio_duration
97
+ return audio, SAMPLE_RATE
98
+
99
+ kokoro._create_audio = types.MethodType(_create_audio_compat, kokoro)
100
+ return kokoro
101
+
102
  def _ensure_file(self, folder, filename):
103
  """Auto-downloads missing models from your Hugging Face repo."""
104
  filepath = os.path.join(folder, filename)
 
113
  )
114
  return filepath
115
 
116
+ def _create_en_callable(self):
117
+ """Create an English G2P callable for handling English tokens in non-English text."""
118
+ en_g2p = EspeakG2P(language="en-us")
119
+ def en_callable(text):
120
+ try:
121
+ phonemes, _ = en_g2p(text)
122
+ return phonemes
123
+ except Exception:
124
+ return text
125
+ return en_callable
126
+
127
  def _get_config(self, lang):
128
  """Routes the correct model, voice, and G2P based on language."""
129
  model_file = self._ensure_file("model", "kokoro.onnx")
130
  voices_file = self._ensure_file("voice", "voices-v1.0.bin")
131
  vocab = None
132
  g2p = None
133
+ en_callable = None
134
 
135
  # Optimized routing: Only load the specific G2P engine requested
136
  if lang == "en":
 
158
  # FIX: Auto-download the Japanese dictionary if it's missing!
159
  if not os.path.exists(unidic.DICDIR):
160
  print("Downloading missing Japanese dictionary (this takes a minute but only happens once)...")
161
+ subprocess.run([sys.executable, "-m", "unidic", "download"], check=True)
162
 
163
  g2p = ja.JAG2P()
164
  voice = "jf_alpha"
165
+ vocab = self._get_vocab_config(lang)
166
+ # Provide English fallback for mixed Japanese-English text
167
+ en_callable = self._create_en_callable()
168
  elif lang == "zh":
169
  from misaki import zh
170
+ import re
171
+
172
+ base_g2p = zh.ZHG2P(version="1.1")
173
+ en_callable = self._create_en_callable()
174
+
175
+ # Wrap ZHG2P to handle English tokens in mixed Chinese-English text.
176
+ def mixed_g2p(text):
177
+ # Split on English words/names and process them separately
178
+ parts = re.split(r'([a-zA-Z]+)', text)
179
+ phonemes_list = []
180
+ for part in parts:
181
+ if part and part[0].isalpha() and part[0].isascii():
182
+ # English token: use English G2P
183
+ phonemes_list.append(en_callable(part))
184
+ else:
185
+ # Chinese token: use Chinese G2P
186
+ if part:
187
+ ph, _ = base_g2p(part)
188
+ phonemes_list.append(ph)
189
+ result = "".join(phonemes_list)
190
+ return result, text
191
+
192
+ g2p = mixed_g2p
193
  voice = "zf_001"
194
  model_file = self._ensure_file("model", "kokoro-v1.1-zh.onnx")
195
  voices_file = self._ensure_file("voice", "voices-v1.1-zh.bin")
196
+ vocab = self._get_vocab_config(lang)
197
  else:
198
  raise ValueError(f"Language '{lang}' not supported.")
199
 
200
+ return model_file, voices_file, vocab, g2p, voice, en_callable
201
 
202
  def generate(self, text, lang, reference_audio, output_path="output.wav"):
203
  """Generates the speech and applies the target voice."""
204
+ model_file, voices_file, vocab, g2p, voice, en_callable = self._get_config(lang)
205
 
206
  # 1. Kokoro TTS Phase
207
  if model_file not in self.kokoro_cache:
208
+ kokoro = Kokoro(model_file, voices_file, vocab_config=vocab) if vocab else Kokoro(model_file, voices_file)
209
+ self.kokoro_cache[model_file] = self._patch_kokoro_compat(kokoro)
210
 
211
  kokoro = self.kokoro_cache[model_file]
212
 
 
230
  ref_wav = load_audio(reference_audio, sample_rate=self.sample_rate).to(self.device)
231
 
232
  with torch.inference_mode():
233
+ converted_wav = chunked_voice_conversion(
234
+ kanade=self.kanade,
235
+ vocoder_model=self.vocoder,
236
+ source_wav=source_wav,
237
+ ref_wav=ref_wav,
238
+ sample_rate=self.sample_rate
239
+ )
240
 
241
+ sf.write(output_path, converted_wav.numpy(), self.sample_rate)
242
  print(f"Success! Saved: {output_path}")
243
 
244
  finally:
245
  if os.path.exists(temp_path):
246
+ os.remove(temp_path) # Clean up temp file silently
247
+
248
+ def convert(self, source_audio, reference_audio, output_path="output.wav"):
249
+ """Re-voices source_audio to sound like reference_audio using chunking."""
250
+ print("Applying Voice Conversion...")
251
+ # Load and push to device
252
+ source_wav = load_audio(source_audio, sample_rate=self.sample_rate).to(self.device)
253
+ ref_wav = load_audio(reference_audio, sample_rate=self.sample_rate).to(self.device)
254
+
255
+ with torch.inference_mode():
256
+ converted_wav = chunked_voice_conversion(
257
+ kanade=self.kanade,
258
+ vocoder_model=self.vocoder,
259
+ source_wav=source_wav,
260
+ ref_wav=ref_wav,
261
+ sample_rate=self.sample_rate
262
+ )
263
+
264
+ sf.write(output_path, converted_wav.numpy(), self.sample_rate)
265
+ print(f"Success! Saved: {output_path}")