"""Synthesize one Hindi sentence with a local VITS checkpoint and save it as a WAV file.

The script loads the model and tokenizer from ``../hindi-tts``, runs a single
forward pass on a fixed sentence, converts the resulting waveform to 16-bit
PCM and writes it to ``hindi.wav`` in the current working directory.
"""

from transformers import VitsModel, AutoTokenizer
import torch
import numpy as np
from scipy.io.wavfile import write


# Model and tokenizer are loaded from the same local checkpoint directory.
model = VitsModel.from_pretrained("../hindi-tts")
tokenizer = AutoTokenizer.from_pretrained("../hindi-tts")


text = "नमस्ते, आप कैसे हैं? मैं टैक्स ऑफिस से बोल रहा हूँ"
inputs = tokenizer(text, return_tensors="pt")
# Inference only: no_grad() avoids building an autograd graph, which also
# lets the result be converted to NumPy below without a detach().
with torch.no_grad():
    output = model(**inputs).waveform


# Drop size-1 dimensions (presumably the batch axis of a single utterance —
# TODO confirm) and move the samples to host memory as a NumPy array.
output = output.squeeze()
output_np = output.cpu().numpy()
# Clip before scaling: the 32767 factor assumes float samples in [-1, 1].
# Any sample outside that range would overflow the int16 cast and can wrap
# around to the opposite sign, which is audible as loud clicks.
output_int16 = (np.clip(output_np, -1.0, 1.0) * 32767).astype(np.int16)
write("hindi.wav", rate=model.config.sampling_rate, data=output_int16)