"""Voice-assistant demo.

Pipeline: upload audio in a Gradio UI -> transcribe with Whisper on Groq ->
let a tool-use Llama model select and call one of the functions declared in
``utils.mapper`` -> summarize the tool result -> speak it with ElevenLabs.
"""

import json
import os
import platform
import subprocess
import threading
import time  # noqa: F401  (kept: file-level import, may be used elsewhere)
from typing import Iterator

import gradio as gr
from groq import Groq
from elevenlabs import play, stream
from elevenlabs.client import ElevenLabs, AsyncElevenLabs  # noqa: F401

from utils.mapper import TOOLS, AVAILABLE_FUNCTIONS, COMMON_VARS

# SECURITY: API keys were previously hard-coded here and are therefore
# compromised — rotate them. They are now read from the environment.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
elevenlabs_client = ElevenLabs(api_key=os.environ.get("ELEVENLABS_API_KEY"))

# Single model used for both the tool-selection call and the final answer.
CHAT_MODEL = "llama3-groq-70b-8192-tool-use-preview"
GREETING_FILE = "greetings.mp3"


def text_to_speech_file(text: str, play_audio: bool) -> Iterator[bytes]:
    """Convert *text* to speech with ElevenLabs.

    Args:
        text: The text to synthesize.
        play_audio: When True, stream the audio to the speakers immediately.

    Returns:
        The (possibly already-consumed) audio byte iterator from the SDK.
    """
    audio = elevenlabs_client.generate(
        text=text,
        voice="Adam",
        model="eleven_turbo_v2_5",
        stream=True,
        optimize_streaming_latency=3,
    )
    if play_audio:
        print("streaming")
        stream(audio)
    return audio


def create_content(result=None):
    """Build the system prompt for the tool-use model.

    Args:
        result: Unused; kept for backward compatibility with existing callers.

    Returns:
        The system-prompt string, with ``COMMON_VARS`` defaults interpolated.
    """
    content = (
        f"""You are an AI assistant that will suggest and call the functions provided in the tools based on the user's request. You need to analyze the user's request and select the function from the provided tools that best matches the request and provide the results by calling the appropriate function. Expect all parameters from user request, Consider the dates according to the user query for example if user asking some operations for today, it should be understood to get today's date in YYYY-MM-DD format for date parameters. 
If required parameters are not in the user request these default can be used {COMMON_VARS} """
    )
    return content


def _transcribe(audio_file_path, language):
    """Transcribe an audio file with Whisper on Groq and return the text."""
    with open(audio_file_path, "rb") as audio_file:
        response = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-large-v3",
            language=language,
        )
    return response.text


def _answer_with_tools(result, transcribed_text):
    """Run the two-step tool-use conversation and return the final answer.

    Step 1 asks the model to pick a tool; each requested tool is executed via
    ``AVAILABLE_FUNCTIONS``.  Step 2 feeds the tool output back for a
    natural-language summary.

    Raises:
        Exception: If the model requested no tool for the query.
    """
    messages = [
        {"role": "system", "content": create_content(result)},
        {"role": "user", "content": result},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        tools=TOOLS,
        tool_choice="auto",
        model=CHAT_MODEL,
        temperature=0.5,
        max_tokens=500,
    )
    response_message = chat_completion.choices[0].message
    tool_calls = response_message.tool_calls
    if not tool_calls:
        raise Exception(f"No Tool Found associated with query: {transcribed_text}")
    messages.append(response_message)
    for tool_call in tool_calls:
        function_name = tool_call.function.name
        function_to_call = AVAILABLE_FUNCTIONS[function_name]
        function_args = json.loads(tool_call.function.arguments)
        function_response = function_to_call(**function_args)
        messages.append(
            {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": function_name,
                # NOTE(review): assumes the mapped function returns a string
                # (the API requires string content) — confirm in utils.mapper.
                "content": function_response,
            }
        )
    second_response = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=messages,
    )
    return second_response.choices[0].message.content


def background_task(audio_file_path, language, additional_text):
    """Full pipeline (transcribe -> tools -> answer), speaking synchronously."""
    transcribed_text = _transcribe(audio_file_path, language)
    result = f"Transcription: {transcribed_text}\n\nAdditional Context: {additional_text}\n\n"
    final_response = _answer_with_tools(result, transcribed_text)
    print("final: ", final_response)
    # Speak the answer before returning (blocking).
    text_to_speech_file(final_response, True)
    return final_response


def play_audio():
    """Play the canned greeting clip from disk."""
    with open(GREETING_FILE, "rb") as f:
        bytes_data = f.read()
    play(bytes_data)


def transcribe_audio(audio_file_path, language, additional_text):
    """Gradio handler: greet, run the pipeline, speak the answer in a thread.

    Returns the final answer text, or an error string on any failure (the
    broad except keeps the UI responsive instead of surfacing a traceback).
    """
    try:
        # Play a greeting so the user gets immediate feedback while we work.
        play_audio()
        transcribed_text = _transcribe(audio_file_path, language)
        result = f"Transcription: {transcribed_text}\n\nAdditional Context: {additional_text}\n\n"
        final_response = _answer_with_tools(result, transcribed_text)
        print("final: ", final_response)
        # Speak asynchronously so the text answer returns to the UI at once.
        thread = threading.Thread(
            target=text_to_speech_file, args=(final_response, True)
        )
        thread.start()
        return final_response
    except Exception as e:
        return f"An error occurred: {str(e)}"


def speach_to_text():
    """Build and launch the Gradio transcription interface (blocking)."""
    # Supported language codes (adjust to Groq's actual supported set).
    languages = ["en", "ba", "ms", "is", "no", "id"]
    iface = gr.Interface(
        fn=transcribe_audio,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File"),
            gr.Dropdown(choices=languages, label="Select Language", value="en"),
            gr.Textbox(
                label="Additional Text",
                placeholder="Enter any additional context or instructions here...",
            ),
        ],
        outputs="text",
        title="Groq Speech-to-Text Transcription",
        description="Upload an audio file, set parameters, and provide additional text for context in the "
        "transcription process.",
    )
    iface.launch()


if __name__ == "__main__":
    print(f"platform: {platform.platform()}")
    if "Linux" in platform.platform():
        # freedesktop_os_release() raises OSError on non-Linux platforms,
        # so it must only be called inside this branch.
        print(f"platform: {platform.freedesktop_os_release()}")
        # mpv is required by the elevenlabs `stream` helper for playback.
        subprocess.run(["apt", "install", "-y", "mpv"])
    speach_to_text()