When I used this model, I discovered two issues.

#1
by Chris7v7 - opened

Since the model card does not mention voice, I chose the original version of Tara.
1. When I attempt to generate a Thai phrase, the audio at the end includes a long stretch of static (electrical) noise without any speech.
2. When I attempt to generate a longer Thai sentence, the audio starts abnormally, plays normally in the middle, but becomes abnormal again towards the end.
This is my code:

from orpheus_tts import OrpheusModel
import wave
import time

def main():
    """Generate Thai speech with an Orpheus TTS model and save it to thai.wav.

    Streams PCM chunks from the model into a 16-bit mono 24 kHz WAV file,
    then reports wall-clock generation time versus produced audio duration.
    """
    model = OrpheusModel(model_name="kadirnar/Orpheus-Porjai-Thai")
    prompt = '''ฉันอยากไปเที่ยวเชียงใหม่'''

    start_time = time.monotonic()
    # NOTE(review): the model card does not list supported voices; "tara" is
    # the original English voice and may not suit Thai — verify upstream.
    syn_tokens = model.generate_speech(
        prompt=prompt,
        voice="tara",
    )

    with wave.open("thai.wav", "wb") as wf:
        wf.setnchannels(1)      # mono
        wf.setsampwidth(2)      # 16-bit samples
        wf.setframerate(24000)  # Orpheus outputs 24 kHz audio

        # Loop-invariant: bytes per frame = sample width * channel count.
        bytes_per_frame = wf.getsampwidth() * wf.getnchannels()
        total_frames = 0
        for audio_chunk in syn_tokens:  # output streaming
            total_frames += len(audio_chunk) // bytes_per_frame
            wf.writeframes(audio_chunk)
        duration = total_frames / wf.getframerate()

        end_time = time.monotonic()
        print(f"It took {end_time - start_time} seconds to generate {duration:.2f} seconds of audio")


# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()

Sign up or log in to comment