"""Web research tools for a LlamaIndex agent: web search, per-page summarization, Wikipedia lookup, and YouTube transcripts."""

import asyncio
import logging
import os
from typing import Dict, List

import aiohttp
import html2text
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.schema import Document
from llama_index.core.tools import FunctionTool
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from llama_index.tools.tavily_research import TavilyToolSpec
from llama_index.tools.wikipedia import WikipediaToolSpec

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Default models used by the query engines built in this module.
Settings.llm = OpenAI(model="gpt-4o", temperature=0.1)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")


async def fetch_and_process(
    urls: List[str], session: aiohttp.ClientSession, timeout: int = 10
) -> List[Document]:
    """Fetch webpages concurrently and convert them to Document objects."""

    async def fetch(url: str) -> Dict[str, str]:
        try:
            async with session.get(
                url, timeout=aiohttp.ClientTimeout(total=timeout)
            ) as response:
                return {"text": await response.text(), "url": str(response.url)}
        except (asyncio.TimeoutError, aiohttp.ClientError) as e:
            logger.warning(f"Could not fetch {url}: {repr(e)}")
            return {"text": "", "url": url}

    tasks = [fetch(url) for url in urls]
    responses = await asyncio.gather(*tasks)

    # Keep only successful fetches and convert the HTML to plain text.
    return [
        Document(text=html2text.html2text(resp["text"]), id_=resp["url"])
        for resp in responses
        if resp["text"]
    ]


async def summarize_websites(urls: List[str], query: str) -> List[str]:
    """Summarize content relevant to a query across one or more websites.

    Works with a single URL as well as a list of URLs.

    Args:
        urls: A list of URLs to fetch and summarize.
        query: The query the summaries should answer.
    Returns:
        A list of per-source summaries.
    """
    logger.info(f"Summarizing {len(urls)} websites for query: {query}")

    # Use a cheaper model for per-page summarization.
    Settings.llm = OpenAI(model="gpt-4o-mini")
    Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

    summaries = []
    async with aiohttp.ClientSession() as session:
        documents = await fetch_and_process(urls, session)
        for doc in documents:
            # Build a throwaway index per page and query it for a detailed summary.
            index = VectorStoreIndex.from_documents([doc])
            result = index.as_query_engine().query(
                f"Summarize in very meticulous detail. {query}"
            )
            summaries.append(f"Source: {doc.id_}\nContent: {result.response}")

    return summaries


def tavily_search(query: str, max_results: int = 10) -> List[Dict]:
    """Search the web with Tavily and return formatted results.

    Args:
        query: The query to search for.
        max_results: The maximum number of results to return.
    Returns:
        results: A list of dictionaries containing result URLs and content snippets.
            The URLs should then be passed to the summarize_websites tool.
    """
    logger.info(f"Called tavily_search for: {query}")

    try:
        search_engine = TavilyToolSpec(api_key=os.getenv("TAVILY_API_KEY"))
        search_results = search_engine.search(query, max_results=max_results)

        results = []
        for document in search_results:
            results.append({
                "url": document.metadata.get("url", ""),
                "content": f"Title: {document.metadata.get('title', '')}\nContent: {document.text}",
            })
        return results
    except Exception as e:
        logger.error(f"Tavily search failed: {str(e)}")
        return [{"error": f"Search failed: {str(e)}"}]


def search_wikipedia(query: str, language: str = "en") -> str:
    """Search Wikipedia for specific information.

    This is a more efficient way to search Wikipedia than the tavily_search tool,
    and it returns the article content directly.

    Args:
        query: The search query
        language: Wikipedia language code (default: "en")
    Returns:
        str: Wikipedia content summary
    """
    logger.info(f"Searching Wikipedia for: {query}")

    try:
        wikipedia_tool = WikipediaToolSpec()
        return wikipedia_tool.search_data(query, lang=language)
    except Exception as e:
        logger.error(f"Wikipedia search failed: {str(e)}")
        return f"Wikipedia search failed: {str(e)}"


def transcribe_youtube_video(video_url: str) -> str:
    """Transcribe a YouTube video and return the transcript text."""
    logger.info(f"Transcribing YouTube video: {video_url}")

    reader = YoutubeTranscriptReader()
    # load_data expects a list of links and returns a list of Documents.
    documents = reader.load_data(ytlinks=[video_url])
    return "\n".join(doc.text for doc in documents)


def get_web_tools():
    """Return all available tools for the agent."""
    return [
        FunctionTool.from_defaults(summarize_websites),
        FunctionTool.from_defaults(tavily_search),
        FunctionTool.from_defaults(search_wikipedia),
        FunctionTool.from_defaults(transcribe_youtube_video),
    ]
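

# Minimal usage sketch (not part of the tool set): it assumes TAVILY_API_KEY and
# OPENAI_API_KEY are set in the environment, and simply chains tavily_search into
# summarize_websites. The query strings below are only illustrations.
if __name__ == "__main__":

    async def _demo() -> None:
        hits = tavily_search("history of the Python programming language", max_results=3)
        urls = [hit["url"] for hit in hits if hit.get("url")]
        for summary in await summarize_websites(urls, "Who created Python and when?"):
            print(summary)

    asyncio.run(_demo())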