Spaces:

Bas95
/

tripplanner-backend

Sleeping

App Files Files Community

tripplanner-backend / backend /tools /browser.py

Bas95

Deploy Tripplanner backend

649703e verified 13 days ago

raw

history blame contribute delete

16.7 kB

	"""
	BrowserSession — async Playwright wrapper for the Tripplanner browser-automation layer.

	AI subagents (claude-haiku with tool_use) call the exposed methods to navigate
	delta.com, ihg.com, and resy.com. A persistent Chromium profile keeps users
	logged in between runs.

	Usage:
	    async with BrowserSession(user_data_dir="/tmp/tripplanner-profile") as browser:
	        await browser.navigate("https://www.delta.com")
	        text = await browser.get_page_text()
	"""

	from __future__ import annotations

	import base64
	import os
	from pathlib import Path
	from typing import Any

	from playwright.async_api import async_playwright, BrowserContext, Page, TimeoutError as PWTimeoutError


	class BrowserSession:
	"""Async context manager wrapping a Chromium browser.

	On Render (or any headless environment), set BROWSER_USER_DATA_DIR to a
	persistent disk path and upload sessions.json via /api/session/upload.
	Locally, run scripts/session_setup.py once to log in interactively.
	"""

	def __init__(
	self,
	user_data_dir: str,
	headless: bool \| None = None,
	viewport: dict[str, int] \| None = None,
	slow_mo: float = 0,
	) -> None:
	self._user_data_dir = user_data_dir
	# Auto-detect headless: if no DISPLAY env var, force headless (i.e. on Render)
	if headless is None:
	self._headless = not bool(os.getenv("DISPLAY", "")) or os.getenv("RENDER") == "true"
	else:
	self._headless = headless
	self._viewport = viewport or {"width": 1280, "height": 900}
	self._slow_mo = slow_mo

	self._pw = None
	self._context: BrowserContext \| None = None
	self._page: Page \| None = None

	def _session_file(self) -> Path \| None:
	"""Return path to sessions.json if available.

	Priority:
	1. BROWSER_SESSION_JSON env var (base64-encoded) — for HF Spaces / ephemeral envs
	2. sessions.json on disk (uploaded via /api/session/upload)
	"""
	# 1. Decode from env var (HF Spaces stores secrets as env vars)
	b64 = os.getenv("BROWSER_SESSION_JSON", "")
	if b64:
	out = Path(self._user_data_dir) / "sessions.json"
	out.parent.mkdir(parents=True, exist_ok=True)
	out.write_bytes(base64.b64decode(b64))
	return out

	# 2. File on disk (Render persistent disk or local upload)
	p = Path(self._user_data_dir) / "sessions.json"
	return p if p.exists() else None

	# ------------------------------------------------------------------
	# Async context manager
	# ------------------------------------------------------------------

	async def __aenter__(self) -> "BrowserSession":
	self._pw = await async_playwright().start()
	session_file = self._session_file()

	if session_file:
	# Render / deployed: use a regular (non-persistent) context with saved storage state.
	browser = await self._pw.chromium.launch(
	headless=self._headless,
	slow_mo=self._slow_mo,
	)
	self._context = await browser.new_context(
	storage_state=str(session_file),
	viewport=self._viewport,
	)
	else:
	# Local: use a persistent profile so the user stays logged in.
	self._context = await self._pw.chromium.launch_persistent_context(
	self._user_data_dir,
	headless=self._headless,
	viewport=self._viewport,
	slow_mo=self._slow_mo,
	)

	if self._context.pages:
	self._page = self._context.pages[0]
	else:
	self._page = await self._context.new_page()
	return self

	async def __aexit__(self, *_: Any) -> None:
	if self._context:
	await self._context.close()
	if self._pw:
	await self._pw.stop()

	# ------------------------------------------------------------------
	# Browser action methods
	# ------------------------------------------------------------------

	async def navigate(self, url: str) -> str:
	"""Navigate to a URL and return the page title."""
	try:
	await self._page.goto(url, wait_until="domcontentloaded")
	title = await self._page.title()
	return title or "(no title)"
	except Exception as exc:
	return f"Error: {exc}"

	async def click(self, selector: str \| None = None, text: str \| None = None) -> str:
	"""Click an element by CSS selector OR by visible text (not both).

	Returns a confirmation string or an error message.
	"""
	try:
	if selector and text:
	return "Error: provide selector OR text, not both."
	if selector:
	await self._page.click(selector)
	return f"Clicked selector: {selector}"
	if text:
	await self._page.get_by_text(text, exact=False).first.click()
	return f"Clicked element with text: {text!r}"
	return "Error: provide selector or text."
	except Exception as exc:
	return f"Error: {exc}"

	async def fill(self, selector: str, value: str) -> str:
	"""Clear and fill an input field, then return confirmation."""
	try:
	await self._page.fill(selector, value)
	return f"Filled {selector!r} with value."
	except Exception as exc:
	return f"Error: {exc}"

	async def get_text(self, selector: str, limit: int = 20) -> list[str]:
	"""Return the inner text of all elements matching selector, up to limit items."""
	try:
	texts = await self._page.locator(selector).all_inner_texts()
	return texts[:limit]
	except Exception as exc:
	return [f"Error: {exc}"]

	async def get_page_text(self) -> str:
	"""Return visible page text (body), truncated to 8000 characters."""
	try:
	text = await self._page.inner_text("body")
	return text[:8000]
	except Exception as exc:
	return f"Error: {exc}"

	async def wait_for(self, selector: str, timeout: int = 10000) -> bool:
	"""Wait until selector appears in the DOM. Returns True on success, False on timeout."""
	try:
	await self._page.wait_for_selector(selector, timeout=timeout)
	return True
	except PWTimeoutError:
	return False
	except Exception:
	return False

	async def select_option(self, selector: str, value: str) -> str:
	"""Select an <option> by value in a <select> element."""
	try:
	await self._page.select_option(selector, value=value)
	return f"Selected option {value!r} in {selector!r}."
	except Exception as exc:
	return f"Error: {exc}"

	async def press_key(self, key: str) -> str:
	"""Send a keyboard key to the focused element (e.g. 'Enter', 'Tab')."""
	try:
	await self._page.keyboard.press(key)
	return f"Pressed key: {key}"
	except Exception as exc:
	return f"Error: {exc}"

	async def scroll_down(self) -> str:
	"""Scroll the page down by ~800 px."""
	try:
	await self._page.mouse.wheel(0, 800)
	return "Scrolled down."
	except Exception as exc:
	return f"Error: {exc}"

	async def screenshot_base64(self) -> str:
	"""Take a full-page screenshot and return it as a base64-encoded PNG string."""
	try:
	png_bytes = await self._page.screenshot(full_page=True)
	return base64.b64encode(png_bytes).decode()
	except Exception as exc:
	return f"Error: {exc}"

	# ------------------------------------------------------------------
	# Anthropic tool definitions
	# ------------------------------------------------------------------

	@classmethod
	def tool_definitions(cls) -> list[dict]:
	"""Return Anthropic-compatible tool definitions for all browser methods.

	Pass the result directly to ``client.messages.create(tools=...)``.
	"""
	return [
	{
	"name": "navigate",
	"description": "Navigate the browser to a URL and return the page title.",
	"input_schema": {
	"type": "object",
	"properties": {
	"url": {
	"type": "string",
	"description": "The full URL to navigate to (e.g. 'https://www.delta.com').",
	}
	},
	"required": ["url"],
	},
	},
	{
	"name": "click",
	"description": (
	"Click an element on the page. Provide EITHER 'selector' (CSS) "
	"OR 'text' (visible text match), not both."
	),
	"input_schema": {
	"type": "object",
	"properties": {
	"selector": {
	"type": "string",
	"description": "CSS selector of the element to click.",
	},
	"text": {
	"type": "string",
	"description": "Visible text of the element to click.",
	},
	},
	"required": [],
	},
	},
	{
	"name": "fill",
	"description": "Clear and type a value into an input or textarea element.",
	"input_schema": {
	"type": "object",
	"properties": {
	"selector": {
	"type": "string",
	"description": "CSS selector of the input element.",
	},
	"value": {
	"type": "string",
	"description": "The text to enter into the field.",
	},
	},
	"required": ["selector", "value"],
	},
	},
	{
	"name": "get_text",
	"description": (
	"Return the inner text of all elements matching a CSS selector, "
	"up to 'limit' results."
	),
	"input_schema": {
	"type": "object",
	"properties": {
	"selector": {
	"type": "string",
	"description": "CSS selector to match elements.",
	},
	"limit": {
	"type": "integer",
	"description": "Maximum number of text strings to return (default 20).",
	},
	},
	"required": ["selector"],
	},
	},
	{
	"name": "get_page_text",
	"description": "Return the full visible text of the current page, truncated to 8000 characters.",
	"input_schema": {
	"type": "object",
	"properties": {},
	"required": [],
	},
	},
	{
	"name": "wait_for",
	"description": (
	"Wait for a CSS selector to appear in the DOM. "
	"Returns true on success, false on timeout."
	),
	"input_schema": {
	"type": "object",
	"properties": {
	"selector": {
	"type": "string",
	"description": "CSS selector to wait for.",
	},
	"timeout": {
	"type": "integer",
	"description": "Maximum wait time in milliseconds (default 10000).",
	},
	},
	"required": ["selector"],
	},
	},
	{
	"name": "select_option",
	"description": "Select an option by value in a <select> element.",
	"input_schema": {
	"type": "object",
	"properties": {
	"selector": {
	"type": "string",
	"description": "CSS selector of the <select> element.",
	},
	"value": {
	"type": "string",
	"description": "The option value to select.",
	},
	},
	"required": ["selector", "value"],
	},
	},
	{
	"name": "press_key",
	"description": "Send a keyboard key press to the focused element (e.g. 'Enter', 'Tab', 'Escape').",
	"input_schema": {
	"type": "object",
	"properties": {
	"key": {
	"type": "string",
	"description": "The key to press (Playwright key name, e.g. 'Enter', 'Tab').",
	}
	},
	"required": ["key"],
	},
	},
	{
	"name": "scroll_down",
	"description": "Scroll the current page down by approximately 800 pixels.",
	"input_schema": {
	"type": "object",
	"properties": {},
	"required": [],
	},
	},
	{
	"name": "screenshot_base64",
	"description": "Take a full-page screenshot and return it as a base64-encoded PNG string (for debugging).",
	"input_schema": {
	"type": "object",
	"properties": {},
	"required": [],
	},
	},
	]

	# ------------------------------------------------------------------
	# Tool dispatcher
	# ------------------------------------------------------------------

	async def dispatch_tool(self, name: str, input: dict) -> str:
	"""Dispatch a tool call by name to the matching method.

	Returns the result as a string (serialised if necessary).
	Unknown names and bad inputs are caught and returned as error strings.
	"""
	_dispatch: dict[str, Any] = {
	"navigate": self.navigate,
	"click": self.click,
	"fill": self.fill,
	"get_text": self.get_text,
	"get_page_text": self.get_page_text,
	"wait_for": self.wait_for,
	"select_option": self.select_option,
	"press_key": self.press_key,
	"scroll_down": self.scroll_down,
	"screenshot_base64": self.screenshot_base64,
	}

	method = _dispatch.get(name)
	if method is None:
	return f"Error: unknown tool '{name}'."

	try:
	result = await method(**input)
	# Coerce non-string results (bool, list) to strings for uniform AI consumption.
	if isinstance(result, list):
	return "\n".join(str(item) for item in result)
	return str(result)
	except TypeError as exc:
	return f"Error: bad arguments for '{name}': {exc}"
	except Exception as exc:
	return f"Error: {exc}"

	# Aliases used by subagents
	async def execute_tool(self, name: str, input: dict) -> str:
	return await self.dispatch_tool(name, input)

	@property
	def tools(self) -> list[dict]:
	return BrowserSession.tool_definitions()

	def get_tool_definitions(self) -> list[dict]:
	return BrowserSession.tool_definitions()

	@property
	def gemini_tools(self):
	"""Return google-genai Tool objects for all browser methods."""
	from google.genai import types as gtypes
	return [gtypes.Tool(function_declarations=[
	gtypes.FunctionDeclaration(
	name=t["name"],
	description=t["description"],
	parameters=t["input_schema"],
	)
	for t in BrowserSession.tool_definitions()
	])]