Final_Assignment_GregLigon

Sleeping

App Files Files Community

Final_Assignment_GregLigon / app.py

GregPLigon

Update app.py

f9e706d verified 14 days ago

raw

history blame contribute delete

13.4 kB

	import os
	import re
	import threading
	import gradio as gr
	import requests
	import pandas as pd
	from smolagents import ToolCallingAgent, DuckDuckGoSearchTool, VisitWebpageTool, LiteLLMModel

	# --- Constants ---
	# Base URL of the scoring service. The /questions, /files/<task_id> and
	# /submit endpoints used below are all built from this value.
	DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"


	# ============================================================================
	# ANSWER CLEANUP
	# Strips explanatory text so the submitted answer is bare and exact-match ready.
	# ============================================================================

	def clean_answer(raw: str) -> str:
	    """
	    Extract the bare answer from whatever the agent returned.

	    Handles common patterns where the model adds preamble/postamble:
	    markdown emphasis markers, a fenced code block wrapping the answer,
	    an ``[ANSWER]`` tag, and multi-line "reasoning, then answer" output.

	    Args:
	        raw: The agent's raw textual output.

	    Returns:
	        The cleaned answer, or "unknown" when ``raw`` is empty or contains
	        nothing but whitespace/markup.
	    """
	    if not raw:
	        return "unknown"

	    # Remove markdown bold/italic markers
	    text = re.sub(r'\*+', '', raw.strip())

	    # If the text contains a fenced code block, keep only its content.
	    # The capture group is lazy so it stops at the first closing fence.
	    code_fence = re.search(r'```(?:python)?\s*(.*?)\s*```', text, re.DOTALL)
	    if code_fence:
	        text = code_fence.group(1).strip()

	    # Keep everything that follows an [ANSWER] tag, if present
	    answer_tag = re.search(r'\[ANSWER\]\s*(.*)', text, re.DOTALL)
	    if answer_tag:
	        text = answer_tag.group(1).strip()

	    lines = [line.strip() for line in text.splitlines() if line.strip()]

	    # Nothing left after stripping markup: report it the same way as empty input
	    if not lines:
	        return "unknown"

	    # A single line is already a bare answer
	    if len(lines) == 1:
	        return lines[0]

	    # "Thoughts: ... \n <answer>" pattern: prefer the last non-empty line
	    # when it looks like a bare answer rather than a full sentence.
	    last_line = lines[-1]
	    if len(last_line) < 100 and not last_line.endswith(('.', '?', '!')):
	        return last_line
	    # Ends with sentence punctuation but is very short: still use it
	    if len(last_line) < 50:
	        return last_line

	    # Fallback: return the full stripped text
	    return text.strip()


	# ============================================================================
	# AGENT DEFINITION
	# ============================================================================

	class GAIAAgent:
	    """
	    Callable wrapper around a smolagents ToolCallingAgent backed by Gemini.

	    Calling an instance with a question string runs the agent in a worker
	    thread with a time limit and returns a cleaned, bare answer string.
	    """

	    # Maximum time, in seconds, to wait for the agent on a single question.
	    QUESTION_TIMEOUT_SECONDS = 180

	    def __init__(self):
	        """
	        Build the model and the tool-calling agent.

	        Raises:
	            ValueError: If the GEMINI_API_KEY environment variable is not set.
	        """
	        api_key = os.environ.get("GEMINI_API_KEY")
	        if not api_key:
	            raise ValueError("GEMINI_API_KEY not set in Space secrets")

	        # ToolCallingAgent uses JSON tool calls — compatible with how
	        # Gemini 2.5 Flash responds (no code block requirement)
	        model = LiteLLMModel(
	            model_id="gemini/gemini-2.5-flash",
	            api_key=api_key,
	            num_retries=0,
	            temperature=0.0,
	            max_tokens=2048,
	        )

	        self.agent = ToolCallingAgent(
	            model=model,
	            tools=[
	                DuckDuckGoSearchTool(),
	                VisitWebpageTool(),
	            ],
	            max_steps=6,
	        )

	        self.agent.prompt_templates["system_prompt"] = """You are a GAIA benchmark assistant. Your only job is to produce the single correct answer to a question.

	Reply with ONLY the final answer — no explanation, no reasoning, no preamble, no extra words whatsoever.

	Rules:
	- Numbers: use digits (e.g. 4, not "four") UNLESS the question explicitly asks for the number written as a word
	- No units unless the question explicitly asks for them
	- Lists: comma-separated, sorted alphabetically unless another order is specified
	- Omit articles ("a", "an", "the") unless they are part of a proper noun or title
	- Dates: use the format the question implies; if unspecified, use YYYY-MM-DD
	- If the answer cannot be determined, reply with exactly: unknown

	Examples:
	Q: What is 2 + 2?
	A: 4

	Q: How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)?
	A: 5

	Q: List the planets in our solar system.
	A: Earth, Jupiter, Mars, Mercury, Neptune, Saturn, Uranus, Venus
	"""

	    def __call__(self, question: str) -> str:
	        """
	        Run the agent on one question and return a cleaned answer.

	        Args:
	            question: The full question text passed to the agent.

	        Returns:
	            "unknown" if the agent does not finish within
	            QUESTION_TIMEOUT_SECONDS or returns nothing; a string starting
	            with "AGENT ERROR: " if the agent raised; otherwise the agent's
	            output passed through clean_answer.
	        """
	        result_container = [None]
	        error_container = [None]

	        def run_agent():
	            try:
	                result_container[0] = self.agent.run(question)
	            except Exception as e:
	                # Keep the exception itself: its message may be an empty
	                # string, which must still count as a failure.
	                error_container[0] = e

	        # Daemon thread: a run that outlives the timeout must not keep the
	        # interpreter from shutting down.
	        thread = threading.Thread(target=run_agent, daemon=True)
	        thread.start()
	        thread.join(timeout=self.QUESTION_TIMEOUT_SECONDS)

	        if thread.is_alive():
	            print(f" Question timed out: {question[:80]}...")
	            return "unknown"

	        error = error_container[0]
	        if error is not None:
	            print(f" Agent error: {error}")
	            return f"AGENT ERROR: {error}"

	        result = result_container[0]
	        raw = str(result).strip() if result is not None else "unknown"
	        cleaned = clean_answer(raw)
	        if cleaned != raw:
	            print(f" Answer cleaned: {repr(raw[:80])} -> {repr(cleaned[:80])}")
	        return cleaned


	# ============================================================================
	# EVALUATION & SUBMISSION
	# ============================================================================

	def run_and_submit_all(profile: gr.OAuthProfile | None):
	    """
	    Fetch all questions, run the GAIAAgent on each one (downloading any
	    attached file first), submit the answers, and report the results.

	    Args:
	        profile: The logged-in Hugging Face OAuth profile, or None when the
	            user has not logged in.

	    Returns:
	        A ``(status_message, results)`` tuple. ``results`` is a pandas
	        DataFrame of the per-task log, or None when the run stopped before
	        any question was attempted (not logged in, agent failed to
	        initialize, or the questions could not be fetched).
	    """
	    space_id = os.getenv("SPACE_ID")

	    if not profile:
	        print("User not logged in.")
	        return "Please Login to Hugging Face with the button.", None

	    username = f"{profile.username}"
	    print(f"User logged in: {username}")

	    api_url = DEFAULT_API_URL
	    questions_url = f"{api_url}/questions"
	    submit_url = f"{api_url}/submit"

	    # 1. Instantiate Agent
	    try:
	        agent = GAIAAgent()
	    except Exception as e:
	        print(f"Error instantiating agent: {e}")
	        return f"Error initializing agent: {e}", None

	    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
	    print(f"Agent code link: {agent_code}")

	    # 2. Fetch Questions
	    print(f"Fetching questions from: {questions_url}")
	    try:
	        response = requests.get(questions_url, timeout=15)
	        response.raise_for_status()
	        questions_data = response.json()
	        if not questions_data:
	            print("Fetched questions list is empty.")
	            return "Fetched questions list is empty or invalid format.", None
	        print(f"Fetched {len(questions_data)} questions.")
	    # The JSON handler must come before RequestException: requests defines
	    # JSONDecodeError as a RequestException subclass, so the broader clause
	    # would otherwise catch it first and this handler would never run.
	    except requests.exceptions.JSONDecodeError as e:
	        print(f"Error decoding JSON response from questions endpoint: {e}")
	        return f"Error decoding server response for questions: {e}", None
	    except requests.exceptions.RequestException as e:
	        print(f"Error fetching questions: {e}")
	        return f"Error fetching questions: {e}", None
	    except Exception as e:
	        print(f"An unexpected error occurred fetching questions: {e}")
	        return f"An unexpected error occurred fetching questions: {e}", None

	    # 3. Run Agent on each question
	    results_log = []
	    answers_payload = []
	    print(f"Running agent on {len(questions_data)} questions...")

	    for item in questions_data:
	        task_id = item.get("task_id")
	        question_text = item.get("question")
	        file_name = item.get("file_name")

	        if not task_id or question_text is None:
	            print(f"Skipping item with missing task_id or question: {item}")
	            continue

	        print(f" Working on task {task_id}...")

	        # Download attached file if one exists. This is best-effort: on any
	        # failure the question is still attempted without the attachment.
	        if file_name:
	            try:
	                file_url = f"{api_url}/files/{task_id}"
	                file_response = requests.get(file_url, timeout=30)
	                file_response.raise_for_status()
	                # The name comes from the server: keep only its final path
	                # component so the file is always written inside /tmp.
	                safe_name = os.path.basename(str(file_name)) or str(task_id)
	                file_path = f"/tmp/{safe_name}"
	                with open(file_path, "wb") as f:
	                    f.write(file_response.content)
	                question_text = (
	                    f"{question_text}\n\n"
	                    f"[An attached file for this task has been saved to: {file_path}]"
	                )
	                print(f" Downloaded attachment for task {task_id}: {file_name}")
	            except Exception as e:
	                print(f" Could not fetch file for task {task_id}: {e}")

	        # Run the agent
	        try:
	            submitted_answer = agent(question_text)
	            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
	            results_log.append({
	                "Task ID": task_id,
	                "Question": question_text,
	                "Submitted Answer": submitted_answer
	            })
	            print(f" Task {task_id} answered: {submitted_answer[:80]}")
	        except Exception as e:
	            # Logged for display only; a failed task is not submitted.
	            print(f"Error running agent on task {task_id}: {e}")
	            results_log.append({
	                "Task ID": task_id,
	                "Question": question_text,
	                "Submitted Answer": f"AGENT ERROR: {e}"
	            })

	    if not answers_payload:
	        print("Agent did not produce any answers to submit.")
	        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

	    # 4. Submit
	    submission_data = {
	        "username": username.strip(),
	        "agent_code": agent_code,
	        "answers": answers_payload
	    }
	    print(f"Submitting {len(answers_payload)} answers for user '{username}'...")

	    try:
	        response = requests.post(submit_url, json=submission_data, timeout=300)
	        response.raise_for_status()
	        result_data = response.json()
	        final_status = (
	            f"Submission Successful!\n"
	            f"User: {result_data.get('username')}\n"
	            f"Overall Score: {result_data.get('score', 'N/A')}% "
	            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
	            f"Message: {result_data.get('message', 'No message received.')}"
	        )
	        print("Submission successful.")
	        return final_status, pd.DataFrame(results_log)
	    except requests.exceptions.HTTPError as e:
	        error_detail = f"Server responded with status {e.response.status_code}."
	        try:
	            error_json = e.response.json()
	            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
	        except requests.exceptions.JSONDecodeError:
	            # Error body is not JSON: show a truncated copy of the raw text
	            error_detail += f" Response: {e.response.text[:500]}"
	        status_message = f"Submission Failed: {error_detail}"
	        print(status_message)
	        return status_message, pd.DataFrame(results_log)
	    except requests.exceptions.Timeout:
	        status_message = "Submission Failed: The request timed out."
	        print(status_message)
	        return status_message, pd.DataFrame(results_log)
	    except requests.exceptions.RequestException as e:
	        status_message = f"Submission Failed: Network error - {e}"
	        print(status_message)
	        return status_message, pd.DataFrame(results_log)
	    except Exception as e:
	        status_message = f"An unexpected error occurred during submission: {e}"
	        print(status_message)
	        return status_message, pd.DataFrame(results_log)


	# ============================================================================
	# GRADIO INTERFACE
	# ============================================================================

	# Page layout: heading, usage instructions, login button, one action
	# button, and two output widgets filled in by run_and_submit_all.
	with gr.Blocks() as demo:
	    gr.Markdown("# GAIA Benchmark Agent")
	    gr.Markdown(
	        """
	Instructions:

	1. Make sure your `GEMINI_API_KEY` is set in Settings → Variables and secrets.
	2. Log in to your Hugging Face account using the button below.
	3. Click Run Evaluation & Submit All Answers to fetch all 20 questions, run the
	agent on each one, submit your answers, and see your score.

	---
	*Note: This typically takes 20–40 minutes to complete all 20 questions. Keep this
	tab open and active — do not let your computer sleep during the run.*
	"""
	    )

	    gr.LoginButton()

	    run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")

	    # Read-only status text and the per-question results table
	    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
	    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

	    # The handler returns (status text, results table), in that order
	    run_button.click(run_and_submit_all, outputs=[status_output, results_table])

	# Script entry point: print diagnostic information about the hosting Space
	# (when the SPACE_* environment variables are present), then start the UI.
	if __name__ == "__main__":
	    print("\n" + "-" * 30 + " App Starting " + "-" * 30)

	    space_host_startup = os.getenv("SPACE_HOST")
	    space_id_startup = os.getenv("SPACE_ID")

	    if space_host_startup:
	        print(f"✅ SPACE_HOST found: {space_host_startup}")
	        # Append the ".hf.space" suffix only when it is missing, so a value
	        # that is already a full host name does not end up with it twice.
	        if space_host_startup.endswith(".hf.space"):
	            runtime_host = space_host_startup
	        else:
	            runtime_host = f"{space_host_startup}.hf.space"
	        print(f" Runtime URL should be: https://{runtime_host}")
	    else:
	        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")

	    if space_id_startup:
	        print(f"✅ SPACE_ID found: {space_id_startup}")
	        print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
	        print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
	    else:
	        print("ℹ️ SPACE_ID environment variable not found (running locally?).")

	    print("-" * (60 + len(" App Starting ")) + "\n")
	    print("Launching Gradio Interface for GAIA Agent Evaluation...")
	    demo.launch(debug=True, share=False)