Spaces:
Sleeping
Sleeping
File size: 4,557 Bytes
8c162e4 546a399 8c162e4 99b8231 8c162e4 99b8231 8c162e4 2bcf72e cbbbfb4 eb84627 cbbbfb4 68a5482 cbbbfb4 eb84627 cbbbfb4 99b8231 68a5482 cbbbfb4 eb84627 68a5482 bea827c cbbbfb4 bea827c 3f5d44e bea827c cbbbfb4 99b8231 cbbbfb4 eb84627 cbbbfb4 85dd0e2 cbbbfb4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | import math
from typing import Optional, Tuple, Literal
from smolagents import tool
@tool
def extract_text_from_audio(file_path: str) -> str:
    """Transcribe the speech contained in an audio file and return it as text.

    Args:
        file_path: Path to the audio file to transcribe.

    Returns:
        The text recognized in the audio.
    """
    # Function-scope import keeps the tool self-contained.
    import speech_recognition as sr

    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        # Load the audio data into memory.
        audio_data = recognizer.record(source)
    # Convert speech to text with the Google recognizer.
    return recognizer.recognize_google(audio_data)
@tool
def extract_text_from_audio(file_path: str) -> str:
    """
    Extract and return the text transcription of an audio file using speech recognition.

    This tool uses Google's speech recognition API to convert spoken audio content
    into text. It supports the audio formats handled by the SpeechRecognition
    library (WAV, AIFF and FLAC).

    Args:
        file_path: Path to the audio file to be transcribed. The file should
            be in a format compatible with the SpeechRecognition library.

    Returns:
        The extracted text content from the audio file. If the transcription
        fails, a human-readable message starting with "Error" is returned
        instead, so the caller always receives a string and never an exception.

    Examples:
        >>> extract_text_from_audio("meeting_recording.wav")
        "Hello team, welcome to our weekly meeting..."
        >>> extract_text_from_audio("/path/to/audio/interview.flac")
        "Could you please introduce yourself and your background?"
    """
    # Function-scope import keeps the tool self-contained.
    import speech_recognition as sr

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(file_path) as source:
            # Load the audio data into memory.
            audio_data = recognizer.record(source)
        # Convert speech to text with the Google recognizer.
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        # This exception carries no message, so spell the failure out.
        return "Error: the speech in the audio file could not be understood."
    except sr.RequestError as exc:
        return f"Error: the speech recognition service request failed: {exc}"
    except Exception as exc:
        # Deliberate best-effort: report any other failure (missing file,
        # unsupported format, ...) as text instead of crashing the agent step.
        return f"Error: could not transcribe '{file_path}': {exc}"
class TestAgent:
    """Question-answering agent built around a smolagents ``CodeAgent``.

    The agent is given a homemade audio-transcription tool, the smolagents web
    search / web page / final answer tools, and a Wikipedia tool loaded from
    LangChain. It is driven by the ``gpt-4o`` model through ``OpenAIServerModel``.
    Instances are callable: ``agent(question)`` runs the agent on the question.
    """

    def __init__(self):
        """Create the model, assemble the tools and instantiate the ``CodeAgent``."""
        # Import the code agent and basic tools from smolagents.
        from smolagents import (
            CodeAgent,
            DuckDuckGoSearchTool,
            FinalAnswerTool,
            OpenAIServerModel,
            Tool,
            VisitWebpageTool,
        )
        # Import an additional tool from langchain
        # @ https://docs.langchain.com/oss/python/integrations/tools
        from langchain_community.agent_toolkits.load_tools import load_tools

        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        # NOTE(review): this sets an attribute on the smolagents wrapper object;
        # confirm that it actually reaches the underlying LangChain Wikipedia tool.
        wikipedia_tool.top_k_results = 3

        # Import tools from MCP servers @ https://github.com/mcp (currently disabled)
        # from mcp import StdioServerParameters
        # server_parameters = StdioServerParameters(command="uvx",
        #     args=["--quiet", "youtubeqa@0.2.1"],
        #     env={"UV_PYTHON": "3.12", **os.environ},
        # )
        # youtube_tools = MCPServerTool(server_params=server_parameters)

        model = OpenAIServerModel(model_id="gpt-4o")
        # model = InferenceClientModel("Qwen/Qwen2.5-Coder-32B-Instruct")

        # Instantiate the agent.
        self.agent = CodeAgent(
            tools=[
                extract_text_from_audio,  # homemade tool
                DuckDuckGoSearchTool(),  # basic tools from smolagents
                VisitWebpageTool(),
                wikipedia_tool,  # tool from langchain with extra parameters
                # youtube_tools,  # tool from MCP server
                FinalAnswerTool(),
            ],
            additional_authorized_imports=["pandas", "markdownify", "requests"],  # V2 add markdownify & requests
            model=model,
            max_steps=4,  # V3 increase steps
            planning_interval=2,  # V3 add structure
            verbosity_level=2,
            use_structured_outputs_internally=True,  # V3. Adds structure
        )

        # V3. Add guidance: append one more numbered rule to the system prompt template.
        prompt_for_guidance = "\n10. Provide the answer exactly as it is asked, be concise and precise\n\nNow Begin!"
        self.agent.prompt_templates['system_prompt'] = (
            self.agent.prompt_templates['system_prompt'] + prompt_for_guidance
        )

    def __call__(self, question: str) -> str:
        """Run the agent on ``question`` and return its final answer as a string.

        Args:
            question: The question to submit to the agent.

        Returns:
            The agent's final answer, converted with ``str()`` so that the
            declared return type holds even when the agent produces a
            non-string object (a number, a list, ...).
        """
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return str(answer)
|