import argparse
import base64
import os
import sys
import time
from datetime import datetime

import gradio as gr

from agent import BrowserAgent
from computers import BrowserbaseComputer, PlaywrightComputer

# Put this file's directory on sys.path before importing the sibling
# `prompt` module, so the import does not depend on the working directory.
sys.path.append(os.path.dirname(__file__))
from prompt import prompt_options

PLAYWRIGHT_SCREEN_SIZE = (1440, 900)
# NOTE(review): presumably read by the Playwright wrapper to launch a visible
# (non-headless) browser -- confirm against `computers`.
os.environ["PLAYWRIGHT_HEADLESS"] = "false"


def run_genflow(query: str, prompt: str = ""):
    """Run the browser agent against the GenFlow page and stream its steps.

    Args:
        query: The user question the agent should type into GenFlow.
        prompt: Extra instructions inserted between the fixed role description
            and the fixed scrolling/output guidance of the system prompt.

    Yields:
        Each item produced by ``BrowserAgent.agent_loop_yield()``, unchanged.
        ``process_user_query`` unpacks these as
        ``(reasoning, status, function_responses_list)``.
    """
    env = PlaywrightComputer(
        screen_size=PLAYWRIGHT_SCREEN_SIZE,
        initial_url="https://wenku.baidu.com/ndcore/browse/aiunion?fr=options_AIcard_1&_wkts_=1761290807747&bdQuery=genflow&t=1761290807744&tabType=genflow&aiCreat=genflow",
        # When set, the agent tries to highlight the mouse cursor position in
        # screenshots, which helps with visual debugging.
        highlight_mouse=True,
    )

    # System prompt = fixed role description + caller text + fixed guidance.
    prompt = "GenFlow是一个AI聊天机器人。你需要作为测试员,在它的网页底部搜索栏中输入用户提交的问题, 并且观察其输出结果。" + prompt
    prompt += (
        " GenFlow 返回的内容可能比较长,你可以多次执行'scroll_at'操作来查看网页中它返回的上下文。"
        " GenFlow 有时会呈现左边主页面,右边预览区域。在这种情况下执行'scroll_at'的时候你要注意鼠标位置。"
        " 通过反复的滚动,确保主页面和预览区域已经滚动到底。预览区域往往很长,所以你需要多次执行'scroll_at'操作,确保2次滚动操作看到的网页完全一样为止, 给出最终客观评价."
        " GenFlow 一定会给出'输出结果',请保持足够的耐心!"
        " **无论输入的语言是什么,你需要输出中文**"
    )

    # The context manager owns the browser session for the whole agent loop.
    with env as browser_computer:
        agent = BrowserAgent(
            browser_computer=browser_computer,
            query=query,
            system_prompt=prompt,
            model_name='gemini-2.5-computer-use-preview-10-2025',
        )
        yield from agent.agent_loop_yield()


def _screenshot_data_uri(screenshot):
    """Return a PNG ``data:`` URI for *screenshot*, or ``""`` if it is empty.

    Accepts raw image bytes (base64-encoded here) or a str that is assumed to
    be base64 text already; encoding a str again would raise TypeError.
    """
    if not screenshot:
        return ""
    if isinstance(screenshot, str):
        encoded = screenshot
    else:
        encoded = base64.b64encode(screenshot).decode("utf-8")
    return f"data:image/png;base64,{encoded}"


def _normalize_function_response(item):
    """Convert one function response into the dict shape used for display.

    Dict items are read by key with defaults, so a missing key no longer
    raises KeyError. A 3-element list/tuple is read positionally as
    (screenshot, action, response). Anything else becomes an "unknown action"
    placeholder.
    """
    screenshot, action, response = "", "未知操作", ""
    if isinstance(item, dict):
        screenshot = item.get("screenshot", screenshot)
        action = item.get("action", action)
        response = item.get("response", response)
    elif isinstance(item, (list, tuple)) and len(item) == 3:
        screenshot, action, response = item
    return {
        "screenshot": _screenshot_data_uri(screenshot),
        "action": action,
        "response": response,
    }


# === Handle the user query ===
def process_user_query(query, prompt=""):
    """Run the agent for *query* and stream UI updates after every step.

    Args:
        query: Task description entered by the user. ``None``, empty, or
            whitespace-only input is rejected with a message.
        prompt: Extra system-prompt text forwarded to ``run_genflow``.

    Yields:
        ``(steps, status_message, html)`` tuples. ``steps`` is the same list
        object on every yield and grows by one entry per agent step. Each
        entry holds the step number, reasoning, status and a list of
        normalized function results. A final tuple reports completion.
    """
    if not query or not query.strip():
        yield [], "请输入有效的查询", "\n无执行步骤\n"
        return

    steps = []
    for reasoning, status, function_responses_list in run_genflow(query, prompt):
        # One agent step may carry several function results (or none).
        sub_steps = [
            _normalize_function_response(item)
            for item in (function_responses_list or [])
        ]
        steps.append({
            "step": len(steps) + 1,
            "reasoning": reasoning,
            "status": status,
            "functions": sub_steps,
        })
        yield steps, f"正在执行第 {len(steps)} 步: {status}", update_steps_display(steps)
        # Short pause between streamed updates.
        time.sleep(0.3)

    yield steps, f"任务完成!共执行 {len(steps)} 步。", update_steps_display(steps)


# NOTE(review): everything from here on (update_steps_display and the UI
# wiring) has lost its line breaks and HTML markup; the fragment below is kept
# verbatim and must be restored from the original source.
# === 更新步骤展示的 HTML === def update_steps_display(steps): """以HTML格式显示每一步的推理、函数动作及截图""" if not steps: return "暂无执行步骤
" html = "推理: {step['reasoning']}
""" # ✅ 支持多个函数动作展示 for idx, func in enumerate(step["functions"], start=1): # print(func) html += f"""函数调用 {idx}:
动作: {func['action']}
返回: {func['response']}
""" if func["screenshot"]: html += f"输入一个任务描述,Agent 将自动进行 genflow 中浏览、截图、分析并执行下一步操作。