| from smolagents import DuckDuckGoSearchTool, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool, PythonInterpreterTool, tool |
|
|
| from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, pipeline |
| from qwen_vl_utils import process_vision_info |
| import torch |
|
|
| from typing import List, Any, Optional |
| from markdownify import markdownify |
| from tavily import TavilyClient |
|
|
| import os |
| import uuid |
| import json |
| import traceback |
| import requests |
| import datetime |
| import yt_dlp |
| import pandas as pd |
| import wikipedia as wiki |
| from bs4 import BeautifulSoup |
|
|
| import requests |
| from bs4 import BeautifulSoup |
| from markdownify import markdownify as md |
|
|
|
|
@tool
def video_analyzer(file_path: str, query: str) -> str:
    """
    An artificial intelligence tool that takes as input a text string containing
    the absolute path to a video file in MP4 format and a string with
    a detailed text query to analyze the video.

    Args:
        file_path: Absolute path to a video file in MP4 format.
        query: Detailed text query to analyze the video.

    Returns:
        str: Row of text with the results of video file analysis

    Examples:
        >>> video_analyzer("/test/1.mp4", "Identify separate bird species. What is the highest number of bird species to be on camera simultaneously?")
        The video shows a group of Emperor penguins and a single Albatross. Therefore, the highest number of bird species to be on camera simultaneously is 2.

    """
    model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

    # The model and processor are loaded on every call; device_map="auto"
    # lets the library decide where the weights are placed.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id, torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained(model_id)

    prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. " + query

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": f"file://{file_path}", "fps": 1.0},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's actual placement instead of hard-coding "cuda", so the
    # tool also works when device_map="auto" resolved to CPU.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Strip the prompt tokens so only the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]
|
|
|
|
| |
@tool
def wikipedia_available_titles(query: str) -> List[str]:
    """This instrument returns the titles of the articles available on wikipedia.

    Args:
        query: The query that will be used to search for articles on wikipedia.

    Returns:
        list : list of strings with available article titles (empty if the search failed)
    """
    try:
        wiki.set_rate_limiting(rate_limit=True, min_wait=datetime.timedelta(milliseconds=100))
        titles = wiki.search(query)
    except Exception as e:
        print("Exception occurred: ", e, "with query: ", query)
        # Fall back to an empty result so the return below never hits an
        # unbound name when the search raises.
        titles = []

    return titles
| |
|
|
@tool
def wikipedia_summary(title: str) -> str:
    """Fetch the summary of a wikipedia article.

    Args:
        title: The title of the wikipedia article to summarize.

    Returns:
        str : The summary of the article, or an empty string when the lookup raised.
    """
    try:
        # Rate limiting is (re)configured on each call, before the request is made.
        wiki.set_rate_limiting(
            rate_limit=True,
            min_wait=datetime.timedelta(milliseconds=100),
        )
        return wiki.summary(title)
    except Exception as e:
        print("Exception occurred: ", e, "with title: ", title)
        return ""
|
|
|
|
@tool
def reverse_text(text: str) -> str:
    """Return the given text with its characters in reverse order.

    Args:
        text: The line of text to be reversed

    Returns:
        str : Reversed line of text.

    Examples:
        >>> reverse_text("ecnetnes siht dnatsrednu uoy fI")
        If you understand this sentence

    """
    return "".join(reversed(text))
|
|
|
|
# Tavily API key, read once at import time; None when TAVILY_ACCESS_TOKEN is unset.
tavily_access_token = os.environ.get("TAVILY_ACCESS_TOKEN")
|
|
|
|
@tool
def tavily_search(request: str) -> str:
    """
    This is the ultimate tool for finding information on the internet.
    Don't use it to search YouTube! It's useless!

    Args:
        request: A string containing a query to search in the Internet.

    Returns:
        str: JSON string with execution results containing the following fields:
            - query: The search query to execute with Tavily.
            - answer: A short answer to the user's query, generated by an LLM. Included in the response only if include_answer is requested
            - images: List of query-related images. If include_image_descriptions is true, each item will have url and description.
            - results: A list of sorted search results, ranked by relevancy. Contains the following fields:
                - title: The title of the search result.
                - url: The URL of the search result.
                - content: A short description of the search result.
                - score: The relevance score of the search result.
                - raw_content: The cleaned and parsed HTML content of the search result. Only if include_raw_content is true.
    """
    client = TavilyClient(tavily_access_token)
    response = client.search(
        query=request,
        include_raw_content=False,
        max_results=3,
        search_depth='advanced',
    )

    # The signature and docstring promise a JSON string, so serialize the
    # client's response instead of handing back the raw object.
    return json.dumps(response, ensure_ascii=False)
|
|
@tool
def tavily_extract_web_page(url: str) -> str:
    """
    This is the ultimate tool that allows you to retrieve the contents of a web page.
    In other words, to view the website. Don't use it to extract YouTube pages! It's useless!

    Args:
        url: The URL of the web page from which you want to retrieve information.

    Returns:
        str: The parsed and cleaned HTML content of the web page (the raw content extracted), or a message describing the failure when nothing could be extracted.
    """
    client = TavilyClient(tavily_access_token)
    response = client.extract([url], extract_depth="advanced")

    results = response.get("results") or []
    if not results:
        # Nothing was extracted: report it instead of raising IndexError on
        # results[0]. The first failed entry, if any, is included verbatim.
        failed = response.get("failed_results") or []
        reason = failed[0] if failed else "no content returned"
        return f"Failed to extract content from {url}: {reason}"

    return results[0]['raw_content']
|
|
|
|
@tool
def download_youtube_video_audio(url: str) -> tuple[bool, str, str]:
    """
    Downloads a YouTube video to the local "downloads" directory. Video and audio are downloaded separately.
    The video is saved with an .mp4 extension and the audio with an .mp3 extension.

    Args:
        url: The URL of the YouTube video.

    Returns:
        A tuple of three values:
        bool: Execution result. True - success, False - error in file download process.
        str: The absolute path to the downloaded video file (None on failure).
        str: The absolute path to the downloaded audio file (None on failure).
    """
    # Paths are computed before the try block so the cleanup handler below can
    # always reference them.
    guid = str(uuid.uuid4())
    abs_output_dir = os.path.abspath("./downloads")
    video_path = os.path.join(abs_output_dir, f"{guid}.mp4")
    audio_path = os.path.join(abs_output_dir, f"{guid}.mp3")

    # Prefer exactly 360p MP4, then anything smaller in MP4, then the worst
    # stream that is at least 360p.
    format_priority = (
        'bestvideo[height=360][ext=mp4]/'
        'bestvideo[height<360][ext=mp4]/'
        'worstvideo[height>=360]'
    )

    video_options = {
        'format': format_priority,
        'outtmpl': video_path,
        'quiet': True,
        'no_warnings': True,
    }

    # NOTE(review): no audio-conversion postprocessor is configured, so the
    # file is named .mp3 but may hold whatever container "bestaudio" resolves
    # to — confirm downstream readers detect the format from content.
    audio_options = {
        'format': 'bestaudio/best[ext=mp3]',
        'outtmpl': audio_path,
        'quiet': True,
        'no_warnings': True,
    }

    try:
        os.makedirs(abs_output_dir, exist_ok=True)

        with yt_dlp.YoutubeDL(video_options) as ydl:
            ydl.download([url])

        with yt_dlp.YoutubeDL(audio_options) as ydl:
            ydl.download([url])
    except Exception as e:
        # Report the failure instead of swallowing it silently.
        print("Exception occurred: ", e, "with url: ", url)

        # Best-effort cleanup of partial downloads; a missing file is expected
        # when the failure happened before anything was written.
        for path in (video_path, audio_path):
            try:
                os.remove(path)
            except OSError:
                pass

        return False, None, None

    return True, video_path, audio_path
| |
|
|
@tool
def transcribe_audio_file(path: str) -> str:
    """
    The tool takes as input the absolute path to the mp3 file to be transcribed and returns the English text.

    Args:
        path: Absolute path to an audio file in mp3 format.

    Returns:
        str: A string of transcripts of an audio file in English. None is returned if transcription fails.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # The speech-recognition pipeline is built on every call.
    transcribe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-base",
        chunk_length_s=30,
        batch_size=2,
        device=device,
    )
    try:
        transcription = transcribe(
            path,
            batch_size=8,
            generate_kwargs={"language": "english", "task": "transcribe"},
        )["text"]
    except Exception as e:
        # f-string so the actual error and path are printed, not the
        # literal "{e}" / "{path}" placeholders.
        print(f"ERROR: {e}, {path}")
        traceback.print_exc()
        return None

    return transcription
|
|
@tool
def get_excel_data(file_path: str) -> str:
    """
    The tool takes as input an absolute path to the Excel file whose contents are to be output and returns a string of text with the contents of the file.

    Args:
        file_path: Absolute path to an Excel file.

    Returns:
        str: A row with the contents of an Excel file
    """
    # to_string() renders the whole frame; str(df) would abbreviate large
    # sheets according to pandas display options.
    return pd.read_excel(file_path).to_string()
|
|
|
|
@tool
def multiply(a: int, b: int) -> int:
    """Return the product of two numbers.

    Args:
        a: first int
        b: second int
    """
    product = a * b
    return product
|
|
@tool
def add(a: int, b: int) -> int:
    """Return the sum of two numbers.

    Args:
        a: first int
        b: second int
    """
    total = a + b
    return total
|
|
@tool
def subtract(a: int, b: int) -> int:
    """Return the difference of two numbers (a minus b).

    Args:
        a: first int
        b: second int
    """
    difference = a - b
    return difference
|
|
@tool
def divide(a: int, b: int) -> float:
    """Divide two numbers.

    Args:
        a: first int
        b: second int

    Returns:
        float: The quotient a / b (true division, so the result is a float).

    Raises:
        ValueError: If b is zero.
    """
    if b == 0:
        raise ValueError("Cannot divide by zero.")
    return a / b
|
|
@tool
def modulus(a: int, b: int) -> int:
    """Return the remainder of dividing a by b.

    Args:
        a: first int
        b: second int
    """
    remainder = a % b
    return remainder
|
|
|
|
# Tools exported by this module, in their original declaration order.
available_tools = [
    # Text and arithmetic helpers
    reverse_text, multiply, add, subtract, divide, modulus,
    # Media and file helpers
    download_youtube_video_audio, transcribe_audio_file, get_excel_data,
    # Wikipedia lookups and video question answering
    wikipedia_available_titles, wikipedia_summary, video_analyzer,
    # Built-in smolagents tools and Tavily-backed web access
    FinalAnswerTool(), DuckDuckGoSearchTool(), tavily_search, tavily_extract_web_page,
    PythonInterpreterTool(),
]
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run the video analyzer on a previously downloaded clip.
    sample_video = "/workspaces/Final_Assignment_Template/downloads/60cc887f-cb60-4fc6-88c8-a8bbc6a4659a.mp4"
    sample_query = "Identify separate bird species. What is the highest number of bird species to be on camera simultaneously?"

    answer = video_analyzer(sample_video, sample_query)
    print(answer)