File size: 4,557 Bytes
8c162e4
546a399
8c162e4
 
 
 
 
99b8231
 
8c162e4
99b8231
 
 
 
 
 
 
 
8c162e4
 
2bcf72e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbbbfb4
 
 
 
eb84627
cbbbfb4
 
68a5482
 
 
cbbbfb4
 
 
 
 
eb84627
 
 
 
 
 
cbbbfb4
 
 
 
 
99b8231
68a5482
 
cbbbfb4
eb84627
68a5482
bea827c
cbbbfb4
bea827c
 
3f5d44e
bea827c
cbbbfb4
99b8231
 
 
cbbbfb4
 
eb84627
cbbbfb4
85dd0e2
cbbbfb4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import math
from typing import Optional, Tuple, Literal
from smolagents import tool



@tool
def extract_text_from_audio(file_path: str) -> str:
    """Transcribe the speech contained in an audio file and return it as text.

    The whole file is read into memory and passed to the SpeechRecognition
    library's ``recognize_google`` recognizer.

    Args:
        file_path: Path to the audio file to transcribe.

    Returns:
        The recognized text.
    """
    # NOTE(review): a function with this same name is defined again further
    # down in this module and rebinds it, so this version is effectively
    # shadowed. Consider removing one of the two definitions.

    # Local import: the dependency is only needed when the tool actually runs.
    import speech_recognition as sr

    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        # Load the complete audio stream into memory.
        audio_data = recognizer.record(source)
    # Convert speech to text; errors propagate to the caller unchanged.
    return recognizer.recognize_google(audio_data)


@tool
def extract_text_from_audio(file_path: str) -> str:
    """
    Extract and return text transcription from an audio file using speech recognition.

    This tool uses Google's speech recognition API (through the
    SpeechRecognition library) to convert spoken audio content into text.
    Supported input formats are those accepted by the library's ``AudioFile``
    reader: WAV, AIFF and FLAC. Other formats (for example MP3) must be
    converted first.

    This tool never raises: any failure (missing file, unsupported format,
    unintelligible speech, network/API error) is reported through the returned
    string so that the calling agent can read it and react.

    Args:
        file_path (str): Path to the audio file to be transcribed. The file should
            be in a format compatible with the SpeechRecognition library.

    Returns:
        str: The extracted text content from the audio file, or a message
            starting with "Error transcribing" that names the exception type
            and its message when the transcription failed.

    Examples:
        >>> extract_text_from_audio("meeting_recording.wav")
        "Hello team, welcome to our weekly meeting..."
    """
    # Local import: the dependency is only needed when the tool actually runs.
    import speech_recognition as sr

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(file_path) as source:
            # Load the complete audio stream into memory.
            audio_data = recognizer.record(source)
        # Convert speech to text.
        return recognizer.recognize_google(audio_data)
    except Exception as e:
        # Tool boundary: report the failure as text instead of returning the
        # exception object, so the declared ``str`` return type always holds.
        # The type name is included because some exceptions carry no message.
        return f"Error transcribing '{file_path}': {type(e).__name__}: {e}"

    
class TestAgent:
    """Question-answering agent wrapping a smolagents ``CodeAgent``.

    The agent is configured with an OpenAI-served model, a home-made audio
    transcription tool, web search / web page tools, a Wikipedia tool loaded
    from LangChain, and an extra guidance line appended to the system prompt.
    Instances are callable: ``agent(question)`` runs the agent on the question.
    """

    def __init__(self):
        """Create the model, collect the tools and build the ``CodeAgent``."""
        # Agent, model and built-in tools from smolagents.
        from smolagents import (
            CodeAgent,
            DuckDuckGoSearchTool,
            FinalAnswerTool,
            OpenAIServerModel,
            Tool,
            VisitWebpageTool,
        )

        # Additional tool from LangChain
        # @ https://docs.langchain.com/oss/python/integrations/tools
        from langchain_community.agent_toolkits.load_tools import load_tools

        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        # NOTE(review): this sets the attribute on the smolagents wrapper
        # object; confirm that it actually reaches the underlying LangChain
        # Wikipedia API wrapper and limits the number of results.
        wikipedia_tool.top_k_results = 3

        # TODO: add tools from MCP servers (@ https://github.com/mcp), e.g. the
        # "youtubeqa@0.2.1" server started through `uvx --quiet` with
        # UV_PYTHON=3.12. Not wired in yet.

        # NOTE: InferenceClientModel("Qwen/Qwen2.5-Coder-32B-Instruct") was the
        # alternative model considered here.
        model = OpenAIServerModel(model_id="gpt-4o")

        # Instantiate the agent.
        self.agent = CodeAgent(
            tools=[
                extract_text_from_audio,   # homemade tool
                DuckDuckGoSearchTool(),    # basic tools from smolagents
                VisitWebpageTool(),
                wikipedia_tool,            # tool from langchain with extra parameters
                FinalAnswerTool(),
            ],
            additional_authorized_imports=["pandas", "markdownify", "requests"],  # V2 add markdownify & requests
            model=model,
            max_steps=4,                              # V3 increase steps
            planning_interval=2,                      # V3 add structure
            verbosity_level=2,
            use_structured_outputs_internally=True,   # V3. Adds structure
        )

        # V3. add guidance: append one extra numbered rule to the default
        # system prompt of the agent.
        prompt_for_guidance = "\n10. Provide the answer exactly as it is asked, be concise and precise\n\nNow Begin!"
        self.agent.prompt_templates['system_prompt'] = (
            self.agent.prompt_templates['system_prompt'] + prompt_for_guidance
        )

    def __call__(self, question: str) -> str:
        """Run the agent on ``question`` and return its answer.

        Args:
            question: The question text handed to the agent.

        Returns:
            Whatever ``self.agent.run(question)`` returns, passed through
            unchanged.
        """
        # Only a short prefix is logged to keep the console output readable.
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return answer