Spaces:
Build error
Build error
Upload folder using huggingface_hub
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .DS_Store +0 -0
- .gitattributes +27 -0
- .github/workflows/main.yaml +49 -0
- .gitignore +177 -0
- .gradio/certificate.pem +31 -0
- CONTRIBUTING.md +33 -0
- LICENSE +202 -0
- README.md +140 -7
- agent.py +568 -0
- app.py +200 -0
- cache/.DS_Store +0 -0
- cache/user_data/state.json +0 -0
- computers/__init__.py +23 -0
- computers/browserbase/__init__.py +0 -0
- computers/browserbase/browserbase.py +80 -0
- computers/computer.py +125 -0
- computers/playwright/__init__.py +0 -0
- computers/playwright/playwright.py +430 -0
- genflow.py +178 -0
- logs/.DS_Store +0 -0
- logs/screenshot_20251103_212455.png +3 -0
- logs/screenshot_20251103_212504.png +3 -0
- logs/screenshot_20251103_212514.png +3 -0
- logs/screenshot_20251103_212605.png +3 -0
- logs/screenshot_20251103_212612.png +3 -0
- logs/screenshot_20251103_212613.png +3 -0
- logs/screenshot_20251103_212614.png +0 -0
- logs/screenshot_20251103_212622.png +3 -0
- logs/screenshot_20251103_212631.png +3 -0
- logs/screenshot_20251103_212732.png +3 -0
- logs/screenshot_20251103_212738.png +3 -0
- logs/screenshot_20251103_212739.png +3 -0
- logs/screenshot_20251103_212740.png +0 -0
- logs/screenshot_20251103_212750.png +0 -0
- logs/screenshot_20251103_212759.png +3 -0
- logs/screenshot_20251103_212809.png +3 -0
- logs/screenshot_20251103_212949.png +3 -0
- logs/screenshot_20251103_212956.png +3 -0
- logs/screenshot_20251103_212957.png +3 -0
- logs/screenshot_20251103_212958.png +0 -0
- logs/screenshot_20251103_213005.png +0 -0
- logs/screenshot_20251103_213032.png +3 -0
- logs/screenshot_20251103_213042.png +3 -0
- logs/screenshot_20251103_213052.png +3 -0
- logs/screenshot_20251104_155252.png +3 -0
- logs/screenshot_20251104_155301.png +3 -0
- logs/screenshot_20251104_155302.png +3 -0
- logs/screenshot_20251104_155303.png +0 -0
- logs/screenshot_20251104_155330.png +3 -0
- logs/screenshot_20251104_155340.png +3 -0
.DS_Store
ADDED
|
Binary file (10.2 kB). View file
|
|
|
.gitattributes
CHANGED
|
@@ -33,3 +33,30 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
logs/screenshot_20251103_212455.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
logs/screenshot_20251103_212504.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
logs/screenshot_20251103_212514.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
logs/screenshot_20251103_212605.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
logs/screenshot_20251103_212612.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
logs/screenshot_20251103_212613.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
logs/screenshot_20251103_212622.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
logs/screenshot_20251103_212631.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
logs/screenshot_20251103_212732.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
logs/screenshot_20251103_212738.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
logs/screenshot_20251103_212739.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
logs/screenshot_20251103_212759.png filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
logs/screenshot_20251103_212809.png filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
logs/screenshot_20251103_212949.png filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
logs/screenshot_20251103_212956.png filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
logs/screenshot_20251103_212957.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
logs/screenshot_20251103_213032.png filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
logs/screenshot_20251103_213042.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
logs/screenshot_20251103_213052.png filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
logs/screenshot_20251104_155252.png filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
logs/screenshot_20251104_155301.png filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
logs/screenshot_20251104_155302.png filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
logs/screenshot_20251104_155330.png filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
logs/screenshot_20251104_155340.png filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
logs/screenshot_20251106_144427.png filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
logs/screenshot_20251106_144436.png filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
logs/screenshot_20251106_144437.png filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/main.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
name: CI Tests
|
| 15 |
+
|
| 16 |
+
on:
|
| 17 |
+
push:
|
| 18 |
+
branches:
|
| 19 |
+
- main
|
| 20 |
+
pull_request:
|
| 21 |
+
branches:
|
| 22 |
+
- main
|
| 23 |
+
|
| 24 |
+
jobs:
|
| 25 |
+
test-pytest:
|
| 26 |
+
runs-on: ubuntu-latest
|
| 27 |
+
|
| 28 |
+
strategy:
|
| 29 |
+
matrix:
|
| 30 |
+
python-version: ["3.10", "3.11"]
|
| 31 |
+
|
| 32 |
+
steps:
|
| 33 |
+
- name: Check out repository
|
| 34 |
+
uses: actions/checkout@v4
|
| 35 |
+
|
| 36 |
+
- name: Set up Python ${{ matrix.python-version }}
|
| 37 |
+
uses: actions/setup-python@v5
|
| 38 |
+
with:
|
| 39 |
+
python-version: ${{ matrix.python-version }}
|
| 40 |
+
|
| 41 |
+
- name: Install dependencies
|
| 42 |
+
run: |
|
| 43 |
+
python -m pip install --upgrade pip
|
| 44 |
+
pip install -r requirements.txt
|
| 45 |
+
|
| 46 |
+
- name: Run pytest
|
| 47 |
+
run: |
|
| 48 |
+
pytest
|
| 49 |
+
|
.gitignore
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
|
| 110 |
+
# pdm
|
| 111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 112 |
+
#pdm.lock
|
| 113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 114 |
+
# in version control.
|
| 115 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 116 |
+
.pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 121 |
+
__pypackages__/
|
| 122 |
+
|
| 123 |
+
# Celery stuff
|
| 124 |
+
celerybeat-schedule
|
| 125 |
+
celerybeat.pid
|
| 126 |
+
|
| 127 |
+
# SageMath parsed files
|
| 128 |
+
*.sage.py
|
| 129 |
+
|
| 130 |
+
# Environments
|
| 131 |
+
.env
|
| 132 |
+
.venv
|
| 133 |
+
env/
|
| 134 |
+
venv/
|
| 135 |
+
ENV/
|
| 136 |
+
env.bak/
|
| 137 |
+
venv.bak/
|
| 138 |
+
|
| 139 |
+
# Spyder project settings
|
| 140 |
+
.spyderproject
|
| 141 |
+
.spyproject
|
| 142 |
+
|
| 143 |
+
# Rope project settings
|
| 144 |
+
.ropeproject
|
| 145 |
+
|
| 146 |
+
# mkdocs documentation
|
| 147 |
+
/site
|
| 148 |
+
|
| 149 |
+
# mypy
|
| 150 |
+
.mypy_cache/
|
| 151 |
+
.dmypy.json
|
| 152 |
+
dmypy.json
|
| 153 |
+
|
| 154 |
+
# Pyre type checker
|
| 155 |
+
.pyre/
|
| 156 |
+
|
| 157 |
+
# pytype static type analyzer
|
| 158 |
+
.pytype/
|
| 159 |
+
|
| 160 |
+
# Cython debug symbols
|
| 161 |
+
cython_debug/
|
| 162 |
+
|
| 163 |
+
# PyCharm
|
| 164 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 165 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 166 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 167 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 168 |
+
#.idea/
|
| 169 |
+
|
| 170 |
+
# Ruff stuff:
|
| 171 |
+
.ruff_cache/
|
| 172 |
+
|
| 173 |
+
# PyPI configuration file
|
| 174 |
+
.pypirc
|
| 175 |
+
|
| 176 |
+
# Node stuff
|
| 177 |
+
node_modules/
|
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# How to contribute
|
| 2 |
+
|
| 3 |
+
We'd love to accept your patches and contributions to this project.
|
| 4 |
+
|
| 5 |
+
## Before you begin
|
| 6 |
+
|
| 7 |
+
### Sign our Contributor License Agreement
|
| 8 |
+
|
| 9 |
+
Contributions to this project must be accompanied by a
|
| 10 |
+
[Contributor License Agreement](https://cla.developers.google.com/about) (CLA).
|
| 11 |
+
You (or your employer) retain the copyright to your contribution; this simply
|
| 12 |
+
gives us permission to use and redistribute your contributions as part of the
|
| 13 |
+
project.
|
| 14 |
+
|
| 15 |
+
If you or your current employer have already signed the Google CLA (even if it
|
| 16 |
+
was for a different project), you probably don't need to do it again.
|
| 17 |
+
|
| 18 |
+
Visit <https://cla.developers.google.com/> to see your current agreements or to
|
| 19 |
+
sign a new one.
|
| 20 |
+
|
| 21 |
+
### Review our community guidelines
|
| 22 |
+
|
| 23 |
+
This project follows
|
| 24 |
+
[Google's Open Source Community Guidelines](https://opensource.google/conduct/).
|
| 25 |
+
|
| 26 |
+
## Contribution process
|
| 27 |
+
|
| 28 |
+
### Code reviews
|
| 29 |
+
|
| 30 |
+
All submissions, including submissions by project members, require review. We
|
| 31 |
+
use GitHub pull requests for this purpose. Consult
|
| 32 |
+
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
|
| 33 |
+
information on using pull requests.
|
LICENSE
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
Apache License
|
| 3 |
+
Version 2.0, January 2004
|
| 4 |
+
http://www.apache.org/licenses/
|
| 5 |
+
|
| 6 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 7 |
+
|
| 8 |
+
1. Definitions.
|
| 9 |
+
|
| 10 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 11 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 12 |
+
|
| 13 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 14 |
+
the copyright owner that is granting the License.
|
| 15 |
+
|
| 16 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 17 |
+
other entities that control, are controlled by, or are under common
|
| 18 |
+
control with that entity. For the purposes of this definition,
|
| 19 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 20 |
+
direction or management of such entity, whether by contract or
|
| 21 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 22 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 23 |
+
|
| 24 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 25 |
+
exercising permissions granted by this License.
|
| 26 |
+
|
| 27 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 28 |
+
including but not limited to software source code, documentation
|
| 29 |
+
source, and configuration files.
|
| 30 |
+
|
| 31 |
+
"Object" form shall mean any form resulting from mechanical
|
| 32 |
+
transformation or translation of a Source form, including but
|
| 33 |
+
not limited to compiled object code, generated documentation,
|
| 34 |
+
and conversions to other media types.
|
| 35 |
+
|
| 36 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 37 |
+
Object form, made available under the License, as indicated by a
|
| 38 |
+
copyright notice that is included in or attached to the work
|
| 39 |
+
(an example is provided in the Appendix below).
|
| 40 |
+
|
| 41 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 42 |
+
form, that is based on (or derived from) the Work and for which the
|
| 43 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 44 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 45 |
+
of this License, Derivative Works shall not include works that remain
|
| 46 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 47 |
+
the Work and Derivative Works thereof.
|
| 48 |
+
|
| 49 |
+
"Contribution" shall mean any work of authorship, including
|
| 50 |
+
the original version of the Work and any modifications or additions
|
| 51 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 52 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 53 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 54 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 55 |
+
means any form of electronic, verbal, or written communication sent
|
| 56 |
+
to the Licensor or its representatives, including but not limited to
|
| 57 |
+
communication on electronic mailing lists, source code control systems,
|
| 58 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 59 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 60 |
+
excluding communication that is conspicuously marked or otherwise
|
| 61 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 62 |
+
|
| 63 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 64 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 65 |
+
subsequently incorporated within the Work.
|
| 66 |
+
|
| 67 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 68 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 69 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 70 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 71 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 72 |
+
Work and such Derivative Works in Source or Object form.
|
| 73 |
+
|
| 74 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 75 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 76 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 77 |
+
(except as stated in this section) patent license to make, have made,
|
| 78 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 79 |
+
where such license applies only to those patent claims licensable
|
| 80 |
+
by such Contributor that are necessarily infringed by their
|
| 81 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 82 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 83 |
+
institute patent litigation against any entity (including a
|
| 84 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 85 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 86 |
+
or contributory patent infringement, then any patent licenses
|
| 87 |
+
granted to You under this License for that Work shall terminate
|
| 88 |
+
as of the date such litigation is filed.
|
| 89 |
+
|
| 90 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 91 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 92 |
+
modifications, and in Source or Object form, provided that You
|
| 93 |
+
meet the following conditions:
|
| 94 |
+
|
| 95 |
+
(a) You must give any other recipients of the Work or
|
| 96 |
+
Derivative Works a copy of this License; and
|
| 97 |
+
|
| 98 |
+
(b) You must cause any modified files to carry prominent notices
|
| 99 |
+
stating that You changed the files; and
|
| 100 |
+
|
| 101 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 102 |
+
that You distribute, all copyright, patent, trademark, and
|
| 103 |
+
attribution notices from the Source form of the Work,
|
| 104 |
+
excluding those notices that do not pertain to any part of
|
| 105 |
+
the Derivative Works; and
|
| 106 |
+
|
| 107 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 108 |
+
distribution, then any Derivative Works that You distribute must
|
| 109 |
+
include a readable copy of the attribution notices contained
|
| 110 |
+
within such NOTICE file, excluding those notices that do not
|
| 111 |
+
pertain to any part of the Derivative Works, in at least one
|
| 112 |
+
of the following places: within a NOTICE text file distributed
|
| 113 |
+
as part of the Derivative Works; within the Source form or
|
| 114 |
+
documentation, if provided along with the Derivative Works; or,
|
| 115 |
+
within a display generated by the Derivative Works, if and
|
| 116 |
+
wherever such third-party notices normally appear. The contents
|
| 117 |
+
of the NOTICE file are for informational purposes only and
|
| 118 |
+
do not modify the License. You may add Your own attribution
|
| 119 |
+
notices within Derivative Works that You distribute, alongside
|
| 120 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 121 |
+
that such additional attribution notices cannot be construed
|
| 122 |
+
as modifying the License.
|
| 123 |
+
|
| 124 |
+
You may add Your own copyright statement to Your modifications and
|
| 125 |
+
may provide additional or different license terms and conditions
|
| 126 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 127 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 128 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 129 |
+
the conditions stated in this License.
|
| 130 |
+
|
| 131 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 132 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 133 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 134 |
+
this License, without any additional terms or conditions.
|
| 135 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 136 |
+
the terms of any separate license agreement you may have executed
|
| 137 |
+
with Licensor regarding such Contributions.
|
| 138 |
+
|
| 139 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 140 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 141 |
+
except as required for reasonable and customary use in describing the
|
| 142 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 143 |
+
|
| 144 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 145 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 146 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 147 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 148 |
+
implied, including, without limitation, any warranties or conditions
|
| 149 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 150 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 151 |
+
appropriateness of using or redistributing the Work and assume any
|
| 152 |
+
risks associated with Your exercise of permissions under this License.
|
| 153 |
+
|
| 154 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 155 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 156 |
+
unless required by applicable law (such as deliberate and grossly
|
| 157 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 158 |
+
liable to You for damages, including any direct, indirect, special,
|
| 159 |
+
incidental, or consequential damages of any character arising as a
|
| 160 |
+
result of this License or out of the use or inability to use the
|
| 161 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 162 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 163 |
+
other commercial damages or losses), even if such Contributor
|
| 164 |
+
has been advised of the possibility of such damages.
|
| 165 |
+
|
| 166 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 167 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 168 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 169 |
+
or other liability obligations and/or rights consistent with this
|
| 170 |
+
License. However, in accepting such obligations, You may act only
|
| 171 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 172 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 173 |
+
defend, and hold each Contributor harmless for any liability
|
| 174 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 175 |
+
of your accepting any such warranty or additional liability.
|
| 176 |
+
|
| 177 |
+
END OF TERMS AND CONDITIONS
|
| 178 |
+
|
| 179 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 180 |
+
|
| 181 |
+
To apply the Apache License to your work, attach the following
|
| 182 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 183 |
+
replaced with your own identifying information. (Don't include
|
| 184 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 185 |
+
comment syntax for the file format. We also recommend that a
|
| 186 |
+
file or class name and description of purpose be included on the
|
| 187 |
+
same "printed page" as the copyright notice for easier
|
| 188 |
+
identification within third-party archives.
|
| 189 |
+
|
| 190 |
+
Copyright 2025 Google LLC
|
| 191 |
+
|
| 192 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 193 |
+
you may not use this file except in compliance with the License.
|
| 194 |
+
You may obtain a copy of the License at
|
| 195 |
+
|
| 196 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 197 |
+
|
| 198 |
+
Unless required by applicable law or agreed to in writing, software
|
| 199 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 200 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 201 |
+
See the License for the specific language governing permissions and
|
| 202 |
+
limitations under the License.
|
README.md
CHANGED
|
@@ -1,12 +1,145 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
|
| 4 |
-
colorFrom: gray
|
| 5 |
-
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.49.1
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: computer-use-preview
|
| 3 |
+
app_file: app.py
|
|
|
|
|
|
|
| 4 |
sdk: gradio
|
| 5 |
sdk_version: 5.49.1
|
|
|
|
|
|
|
| 6 |
---
|
| 7 |
+
# Computer Use Preview
|
| 8 |
+
|
| 9 |
+
## Quick Start
|
| 10 |
+
|
| 11 |
+
This section will guide you through setting up and running the Computer Use Preview model using either the Gemini Developer API or Vertex AI. Follow these steps to get started.
|
| 12 |
+
|
| 13 |
+
### 1. Installation
|
| 14 |
+
|
| 15 |
+
**Clone the Repository**
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
git clone https://github.com/google/computer-use-preview.git
|
| 19 |
+
cd computer-use-preview
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
**Set up Python Virtual Environment and Install Dependencies**
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
python3 -m venv .venv
|
| 26 |
+
source .venv/bin/activate
|
| 27 |
+
pip install -r requirements.txt
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
**Install Playwright and Browser Dependencies**
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
# Install system dependencies required by Playwright for Chrome
|
| 34 |
+
playwright install-deps chrome
|
| 35 |
+
|
| 36 |
+
# Install the Chrome browser for Playwright
|
| 37 |
+
playwright install chrome
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
### 2. Configuration
|
| 41 |
+
You can get started using either the Gemini Developer API or Vertex AI.
|
| 42 |
+
|
| 43 |
+
#### A. If using the Gemini Developer API:
|
| 44 |
+
|
| 45 |
+
You need a Gemini API key to use the agent:
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
export GEMINI_API_KEY="YOUR_GEMINI_API_KEY"
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
Or to add this to your virtual environment:
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
echo 'export GEMINI_API_KEY="YOUR_GEMINI_API_KEY"' >> .venv/bin/activate
|
| 55 |
+
# After editing, you'll need to deactivate and reactivate your virtual
|
| 56 |
+
# environment if it's already active:
|
| 57 |
+
deactivate
|
| 58 |
+
source .venv/bin/activate
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
Replace `YOUR_GEMINI_API_KEY` with your actual key.
|
| 62 |
+
|
| 63 |
+
#### B. If using the Vertex AI Client:
|
| 64 |
+
|
| 65 |
+
You need to explicitly use Vertex AI, then provide project and location to use the agent:
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
export USE_VERTEXAI=true
|
| 69 |
+
export VERTEXAI_PROJECT="YOUR_PROJECT_ID"
|
| 70 |
+
export VERTEXAI_LOCATION="YOUR_LOCATION"
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
Or to add this to your virtual environment:
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
echo 'export USE_VERTEXAI=true' >> .venv/bin/activate
|
| 77 |
+
echo 'export VERTEXAI_PROJECT="YOUR_PROJECT_ID"' >> .venv/bin/activate
|
| 78 |
+
echo 'export VERTEXAI_LOCATION="YOUR_LOCATION"' >> .venv/bin/activate
|
| 79 |
+
# After editing, you'll need to deactivate and reactivate your virtual
|
| 80 |
+
# environment if it's already active:
|
| 81 |
+
deactivate
|
| 82 |
+
source .venv/bin/activate
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
Replace `YOUR_PROJECT_ID` and `YOUR_LOCATION` with your actual project and location.
|
| 86 |
+
|
| 87 |
+
### 3. Running the Tool
|
| 88 |
+
|
| 89 |
+
The primary way to use the tool is via the `main.py` script.
|
| 90 |
+
|
| 91 |
+
**General Command Structure:**
|
| 92 |
+
|
| 93 |
+
```bash
|
| 94 |
+
python main.py --query "Go to Google and type 'Hello World' into the search bar"
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
**Available Environments:**
|
| 98 |
+
|
| 99 |
+
You can specify a particular environment with the `--env <environment>` flag. Available options:
|
| 100 |
+
|
| 101 |
+
- `playwright`: Runs the browser locally using Playwright.
|
| 102 |
+
- `browserbase`: Connects to a Browserbase instance.
|
| 103 |
+
|
| 104 |
+
**Local Playwright**
|
| 105 |
+
|
| 106 |
+
Runs the agent using a Chrome browser instance controlled locally by Playwright.
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="playwright"
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
You can also specify an initial URL for the Playwright environment:
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="playwright" --initial_url="https://www.google.com/search?q=latest+AI+news"
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
**Browserbase**
|
| 119 |
+
|
| 120 |
+
Runs the agent using Browserbase as the browser backend. Ensure the proper Browserbase environment variables are set: `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID`.
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="browserbase"
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
## Agent CLI
|
| 127 |
+
|
| 128 |
+
The `main.py` script is the command-line interface (CLI) for running the browser agent.
|
| 129 |
+
|
| 130 |
+
### Command-Line Arguments
|
| 131 |
+
|
| 132 |
+
| Argument | Description | Required | Default | Supported Environment(s) |
|
| 133 |
+
|-|-|-|-|-|
|
| 134 |
+
| `--query` | The natural language query for the browser agent to execute. | Yes | N/A | All |
|
| 135 |
+
| `--env` | The computer use environment to use. Must be one of the following: `playwright` or `browserbase`. | No | N/A | All |
|
| 136 |
+
| `--initial_url` | The initial URL to load when the browser starts. | No | https://www.google.com | All |
|
| 137 |
+
| `--highlight_mouse` | If specified, the agent will attempt to highlight the mouse cursor's position in the screenshots. This is useful for visual debugging. | No | False (not highlighted) | `playwright` |
|
| 138 |
+
|
| 139 |
+
### Environment Variables
|
| 140 |
|
| 141 |
+
| Variable | Description | Required |
|
| 142 |
+
|-|-|-|
|
| 143 |
+
| GEMINI_API_KEY | Your API key for the Gemini model. | Yes (when using the Gemini Developer API) |
|
| 144 |
+
| BROWSERBASE_API_KEY | Your API key for Browserbase. | Yes (when using the browserbase environment) |
|
| 145 |
+
| BROWSERBASE_PROJECT_ID | Your Project ID for Browserbase. | Yes (when using the browserbase environment) |
|
agent.py
ADDED
|
@@ -0,0 +1,568 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import os
|
| 15 |
+
from typing import Literal, Optional, Union, Any
|
| 16 |
+
from google import genai
|
| 17 |
+
from google.genai import types
|
| 18 |
+
import termcolor
|
| 19 |
+
from google.genai.types import (
|
| 20 |
+
Part,
|
| 21 |
+
GenerateContentConfig,
|
| 22 |
+
Content,
|
| 23 |
+
Candidate,
|
| 24 |
+
FunctionResponse,
|
| 25 |
+
FinishReason,
|
| 26 |
+
)
|
| 27 |
+
import time
|
| 28 |
+
from rich.console import Console
|
| 29 |
+
from rich.table import Table
|
| 30 |
+
|
| 31 |
+
from computers import EnvState, Computer
|
| 32 |
+
|
| 33 |
+
MAX_RECENT_TURN_WITH_SCREENSHOTS = 5
|
| 34 |
+
PREDEFINED_COMPUTER_USE_FUNCTIONS = [
|
| 35 |
+
"open_web_browser",
|
| 36 |
+
"click_at",
|
| 37 |
+
"hover_at",
|
| 38 |
+
"type_text_at",
|
| 39 |
+
"scroll_document",
|
| 40 |
+
"scroll_at",
|
| 41 |
+
"wait_5_seconds",
|
| 42 |
+
"go_back",
|
| 43 |
+
"go_forward",
|
| 44 |
+
"search",
|
| 45 |
+
"navigate",
|
| 46 |
+
"key_combination",
|
| 47 |
+
"drag_and_drop",
|
| 48 |
+
]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
console = Console()
|
| 52 |
+
|
| 53 |
+
# Built-in Computer Use tools will return "EnvState".
|
| 54 |
+
# Custom provided functions will return "dict".
|
| 55 |
+
FunctionResponseT = Union[EnvState, dict]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def multiply_numbers(x: float, y: float) -> dict:
    """Multiplies two numbers.

    Args:
        x (float): The first number.
        y (float): The second number.

    Returns:
        dict: The result of multiplication.
    """
    # NOTE: the docstring above is kept verbatim on purpose; it is used as the
    # tool description when this callable is registered as a custom function.
    product = x * y
    return dict(result=product)
|
| 69 |
+
|
| 70 |
+
# def multiply_numbers(x: float, y: float) -> dict:
|
| 71 |
+
# """Multiplies two numbers.
|
| 72 |
+
|
| 73 |
+
# Args:
|
| 74 |
+
# x (float): The first number.
|
| 75 |
+
# y (float): The second number.
|
| 76 |
+
|
| 77 |
+
# Returns:
|
| 78 |
+
# dict: The result of multiplication.
|
| 79 |
+
# """
|
| 80 |
+
# return {"result": x * y}
|
| 81 |
+
|
| 82 |
+
# When a function declaration is created with
# types.FunctionDeclaration.from_callable, the whole __doc__ string becomes the
# declaration's description. Even when it follows Google style, it is not
# parsed further into per-parameter / return-value descriptions.
# If a parameter's meaning is complex, spell it out inside the docstring.
def evaluate_answer(query: str, answer: str) -> None:
    """
    Evaluates the answer based on the given query.
    """
    # Placeholder: there is no evaluation logic yet, so the function falls
    # through and implicitly returns None.
|
| 89 |
+
|
| 90 |
+
class BrowserAgent:
|
| 91 |
+
def __init__(
|
| 92 |
+
self,
|
| 93 |
+
browser_computer: Computer,
|
| 94 |
+
query: str,
|
| 95 |
+
system_prompt: str,
|
| 96 |
+
model_name: str,
|
| 97 |
+
verbose: bool = True,
|
| 98 |
+
):
|
| 99 |
+
self._browser_computer = browser_computer
|
| 100 |
+
self._query = query
|
| 101 |
+
self._model_name = model_name
|
| 102 |
+
self._verbose = verbose
|
| 103 |
+
self.final_reasoning = None
|
| 104 |
+
self._client = genai.Client(
|
| 105 |
+
api_key=os.environ.get("GEMINI_API_KEY"),
|
| 106 |
+
vertexai=os.environ.get("USE_VERTEXAI", "0").lower() in ["true", "1"],
|
| 107 |
+
project=os.environ.get("VERTEXAI_PROJECT"),
|
| 108 |
+
location=os.environ.get("VERTEXAI_LOCATION"),
|
| 109 |
+
)
|
| 110 |
+
self._contents: list[Content] = [
|
| 111 |
+
Content(
|
| 112 |
+
role="user",
|
| 113 |
+
parts=[
|
| 114 |
+
Part(text=self._query),
|
| 115 |
+
],
|
| 116 |
+
)
|
| 117 |
+
]
|
| 118 |
+
|
| 119 |
+
# Exclude any predefined functions here.
|
| 120 |
+
excluded_predefined_functions = ["scroll_document"] # 先屏蔽使用这个滚动函数
|
| 121 |
+
|
| 122 |
+
# Add your own custom functions here.
|
| 123 |
+
custom_functions = [
|
| 124 |
+
# For example:
|
| 125 |
+
types.FunctionDeclaration.from_callable(
|
| 126 |
+
client=self._client, callable=multiply_numbers
|
| 127 |
+
)
|
| 128 |
+
]
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
self._generate_content_config = GenerateContentConfig(
|
| 132 |
+
temperature=1,
|
| 133 |
+
top_p=0.95,
|
| 134 |
+
top_k=40,
|
| 135 |
+
max_output_tokens=8192,
|
| 136 |
+
tools=[
|
| 137 |
+
types.Tool(
|
| 138 |
+
computer_use=types.ComputerUse(
|
| 139 |
+
environment=types.Environment.ENVIRONMENT_BROWSER,
|
| 140 |
+
excluded_predefined_functions=excluded_predefined_functions,
|
| 141 |
+
),
|
| 142 |
+
),
|
| 143 |
+
types.Tool(function_declarations=custom_functions),
|
| 144 |
+
],
|
| 145 |
+
system_instruction=[system_prompt] # 定义system prompt
|
| 146 |
+
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
def handle_action(self, action: types.FunctionCall) -> FunctionResponseT:
|
| 150 |
+
"""Handles the action and returns the environment state."""
|
| 151 |
+
if action.name == "open_web_browser":
|
| 152 |
+
return self._browser_computer.open_web_browser()
|
| 153 |
+
elif action.name == "click_at":
|
| 154 |
+
x = self.denormalize_x(action.args["x"])
|
| 155 |
+
y = self.denormalize_y(action.args["y"])
|
| 156 |
+
return self._browser_computer.click_at(
|
| 157 |
+
x=x,
|
| 158 |
+
y=y,
|
| 159 |
+
)
|
| 160 |
+
elif action.name == "hover_at":
|
| 161 |
+
x = self.denormalize_x(action.args["x"])
|
| 162 |
+
y = self.denormalize_y(action.args["y"])
|
| 163 |
+
return self._browser_computer.hover_at(
|
| 164 |
+
x=x,
|
| 165 |
+
y=y,
|
| 166 |
+
)
|
| 167 |
+
elif action.name == "type_text_at":
|
| 168 |
+
x = self.denormalize_x(action.args["x"])
|
| 169 |
+
y = self.denormalize_y(action.args["y"])
|
| 170 |
+
press_enter = action.args.get("press_enter", False)
|
| 171 |
+
clear_before_typing = action.args.get("clear_before_typing", True)
|
| 172 |
+
return self._browser_computer.type_text_at(
|
| 173 |
+
x=x,
|
| 174 |
+
y=y,
|
| 175 |
+
text=action.args["text"],
|
| 176 |
+
press_enter=press_enter,
|
| 177 |
+
clear_before_typing=clear_before_typing,
|
| 178 |
+
)
|
| 179 |
+
elif action.name == "scroll_document":
|
| 180 |
+
return self._browser_computer.scroll_document(action.args["direction"])
|
| 181 |
+
elif action.name == "scroll_at":
|
| 182 |
+
x = self.denormalize_x(action.args["x"])
|
| 183 |
+
y = self.denormalize_y(action.args["y"])
|
| 184 |
+
magnitude = action.args.get("magnitude", 800)
|
| 185 |
+
direction = action.args["direction"]
|
| 186 |
+
|
| 187 |
+
if direction in ("up", "down"):
|
| 188 |
+
magnitude = self.denormalize_y(magnitude)
|
| 189 |
+
elif direction in ("left", "right"):
|
| 190 |
+
magnitude = self.denormalize_x(magnitude)
|
| 191 |
+
else:
|
| 192 |
+
raise ValueError("Unknown direction: ", direction)
|
| 193 |
+
return self._browser_computer.scroll_at(
|
| 194 |
+
x=x, y=y, direction=direction, magnitude=magnitude
|
| 195 |
+
)
|
| 196 |
+
elif action.name == "wait_5_seconds":
|
| 197 |
+
return self._browser_computer.wait_5_seconds()
|
| 198 |
+
elif action.name == "go_back":
|
| 199 |
+
return self._browser_computer.go_back()
|
| 200 |
+
elif action.name == "go_forward":
|
| 201 |
+
return self._browser_computer.go_forward()
|
| 202 |
+
elif action.name == "search":
|
| 203 |
+
return self._browser_computer.search()
|
| 204 |
+
elif action.name == "navigate":
|
| 205 |
+
return self._browser_computer.navigate(action.args["url"])
|
| 206 |
+
elif action.name == "key_combination":
|
| 207 |
+
return self._browser_computer.key_combination(
|
| 208 |
+
action.args["keys"].split("+")
|
| 209 |
+
)
|
| 210 |
+
elif action.name == "drag_and_drop":
|
| 211 |
+
x = self.denormalize_x(action.args["x"])
|
| 212 |
+
y = self.denormalize_y(action.args["y"])
|
| 213 |
+
destination_x = self.denormalize_x(action.args["destination_x"])
|
| 214 |
+
destination_y = self.denormalize_y(action.args["destination_y"])
|
| 215 |
+
return self._browser_computer.drag_and_drop(
|
| 216 |
+
x=x,
|
| 217 |
+
y=y,
|
| 218 |
+
destination_x=destination_x,
|
| 219 |
+
destination_y=destination_y,
|
| 220 |
+
)
|
| 221 |
+
# Handle the custom function declarations here.
|
| 222 |
+
elif action.name == multiply_numbers.__name__:
|
| 223 |
+
return multiply_numbers(x=action.args["x"], y=action.args["y"])
|
| 224 |
+
else:
|
| 225 |
+
raise ValueError(f"Unsupported function: {action}")
|
| 226 |
+
|
| 227 |
+
def get_model_response(
|
| 228 |
+
self, max_retries=5, base_delay_s=1
|
| 229 |
+
) -> types.GenerateContentResponse:
|
| 230 |
+
for attempt in range(max_retries):
|
| 231 |
+
try:
|
| 232 |
+
response = self._client.models.generate_content(
|
| 233 |
+
model=self._model_name,
|
| 234 |
+
contents=self._contents,
|
| 235 |
+
config=self._generate_content_config,
|
| 236 |
+
)
|
| 237 |
+
return response # Return response on success
|
| 238 |
+
except Exception as e:
|
| 239 |
+
print(e)
|
| 240 |
+
if attempt < max_retries - 1:
|
| 241 |
+
delay = base_delay_s * (2**attempt)
|
| 242 |
+
message = (
|
| 243 |
+
f"Generating content failed on attempt {attempt + 1}. "
|
| 244 |
+
f"Retrying in {delay} seconds...\n"
|
| 245 |
+
)
|
| 246 |
+
termcolor.cprint(
|
| 247 |
+
message,
|
| 248 |
+
color="yellow",
|
| 249 |
+
)
|
| 250 |
+
time.sleep(delay)
|
| 251 |
+
else:
|
| 252 |
+
termcolor.cprint(
|
| 253 |
+
f"Generating content failed after {max_retries} attempts.\n",
|
| 254 |
+
color="red",
|
| 255 |
+
)
|
| 256 |
+
raise
|
| 257 |
+
|
| 258 |
+
def get_text(self, candidate: Candidate) -> Optional[str]:
|
| 259 |
+
"""Extracts the text from the candidate."""
|
| 260 |
+
if not candidate.content or not candidate.content.parts:
|
| 261 |
+
return None
|
| 262 |
+
text = []
|
| 263 |
+
# Gemini 在多模态的场景下,会分段返回内容,例如,除了text和function_call之外,也可能包含其他的部分.
|
| 264 |
+
# {
|
| 265 |
+
# "content": {
|
| 266 |
+
# "parts": [
|
| 267 |
+
# {"text": "I’ll start by searching Google for relevant documentation."},
|
| 268 |
+
# {"function_call": {...}},
|
| 269 |
+
# {"text": "Now that I found it, here’s the explanation:"},
|
| 270 |
+
# {"text": "In Python, decorators are higher-order functions..."}
|
| 271 |
+
# ]
|
| 272 |
+
# }
|
| 273 |
+
# }
|
| 274 |
+
for part in candidate.content.parts:
|
| 275 |
+
if part.text:
|
| 276 |
+
text.append(part.text)
|
| 277 |
+
return " ".join(text) or None
|
| 278 |
+
|
| 279 |
+
def extract_function_calls(self, candidate: Candidate) -> list[types.FunctionCall]:
|
| 280 |
+
"""Extracts the function call from the candidate."""
|
| 281 |
+
if not candidate.content or not candidate.content.parts:
|
| 282 |
+
return []
|
| 283 |
+
ret = []
|
| 284 |
+
for part in candidate.content.parts:
|
| 285 |
+
if part.function_call:
|
| 286 |
+
ret.append(part.function_call)
|
| 287 |
+
return ret
|
| 288 |
+
|
| 289 |
+
# reasoning, screenshot_base64, action, status, url
|
| 290 |
+
def run_one_iteration_modify(self):
|
| 291 |
+
# Generate a response from the model.
|
| 292 |
+
try:
|
| 293 |
+
response = self.get_model_response()
|
| 294 |
+
except Exception as e:
|
| 295 |
+
return "Error occurred", "COMPLETE", []
|
| 296 |
+
|
| 297 |
+
if not response.candidates:
|
| 298 |
+
return "No candidates in response", "COMPLETE", []
|
| 299 |
+
|
| 300 |
+
candidate = response.candidates[0]
|
| 301 |
+
if candidate.content:
|
| 302 |
+
self._contents.append(candidate.content)
|
| 303 |
+
|
| 304 |
+
reasoning = self.get_text(candidate)
|
| 305 |
+
function_calls = self.extract_function_calls(candidate)
|
| 306 |
+
|
| 307 |
+
if not function_calls:
|
| 308 |
+
self.final_reasoning = reasoning
|
| 309 |
+
return reasoning, "COMPLETE", []
|
| 310 |
+
|
| 311 |
+
# Process each function call and collect results
|
| 312 |
+
function_responses_list = []
|
| 313 |
+
status = "CONTINUE"
|
| 314 |
+
|
| 315 |
+
for function_call in function_calls:
|
| 316 |
+
try:
|
| 317 |
+
fc_result = self.handle_action(function_call)
|
| 318 |
+
|
| 319 |
+
screenshot_base64 = ""
|
| 320 |
+
url = ""
|
| 321 |
+
response = {}
|
| 322 |
+
if isinstance(fc_result, EnvState): # 浏览器操作事件
|
| 323 |
+
screenshot_base64 = fc_result.screenshot
|
| 324 |
+
url = fc_result.url
|
| 325 |
+
response = {"url": url}
|
| 326 |
+
function_response = FunctionResponse(
|
| 327 |
+
name=function_call.name,
|
| 328 |
+
response={"url": url},
|
| 329 |
+
parts=[types.FunctionResponsePart(
|
| 330 |
+
inline_data=types.FunctionResponseBlob(
|
| 331 |
+
mime_type="image/png", data=screenshot_base64
|
| 332 |
+
)
|
| 333 |
+
)]
|
| 334 |
+
)
|
| 335 |
+
self._contents.append(Content(
|
| 336 |
+
role="user",
|
| 337 |
+
parts=[Part(function_response=function_response)],
|
| 338 |
+
))
|
| 339 |
+
elif isinstance(fc_result, dict): # 自定义函数
|
| 340 |
+
response = fc_result
|
| 341 |
+
self._contents.append(Content(
|
| 342 |
+
role="user",
|
| 343 |
+
parts=[Part(function_response=FunctionResponse(
|
| 344 |
+
name=function_call.name,
|
| 345 |
+
response=fc_result
|
| 346 |
+
))],
|
| 347 |
+
))
|
| 348 |
+
|
| 349 |
+
# Add to function_responses_list
|
| 350 |
+
function_responses_list.append({
|
| 351 |
+
"screenshot": screenshot_base64,
|
| 352 |
+
"action": function_call.name,
|
| 353 |
+
"response": response,
|
| 354 |
+
})
|
| 355 |
+
|
| 356 |
+
except Exception as e:
|
| 357 |
+
return f"Error handling action {function_call.name}: {str(e)}", "COMPLETE", function_responses_list
|
| 358 |
+
|
| 359 |
+
# only keep screenshots in the few most recent turns, remove the screenshot images from the old turns.
|
| 360 |
+
turn_with_screenshots_found = 0
|
| 361 |
+
for content in reversed(self._contents):
|
| 362 |
+
if content.role == "user" and content.parts:
|
| 363 |
+
# check if content has screenshot of the predefined computer use functions.
|
| 364 |
+
has_screenshot = False
|
| 365 |
+
for part in content.parts:
|
| 366 |
+
if (
|
| 367 |
+
part.function_response
|
| 368 |
+
and part.function_response.parts
|
| 369 |
+
and part.function_response.name
|
| 370 |
+
in PREDEFINED_COMPUTER_USE_FUNCTIONS
|
| 371 |
+
):
|
| 372 |
+
has_screenshot = True
|
| 373 |
+
break
|
| 374 |
+
|
| 375 |
+
if has_screenshot:
|
| 376 |
+
turn_with_screenshots_found += 1
|
| 377 |
+
# remove the screenshot image if the number of screenshots exceed the limit.
|
| 378 |
+
if turn_with_screenshots_found > MAX_RECENT_TURN_WITH_SCREENSHOTS:
|
| 379 |
+
for part in content.parts:
|
| 380 |
+
if (
|
| 381 |
+
part.function_response
|
| 382 |
+
and part.function_response.parts
|
| 383 |
+
and part.function_response.name
|
| 384 |
+
in PREDEFINED_COMPUTER_USE_FUNCTIONS
|
| 385 |
+
):
|
| 386 |
+
part.function_response.parts = None
|
| 387 |
+
|
| 388 |
+
return reasoning, status, function_responses_list
|
| 389 |
+
|
| 390 |
+
def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]:
|
| 391 |
+
|
| 392 |
+
# Generate a response from the model.
|
| 393 |
+
if self._verbose:
|
| 394 |
+
with console.status(
|
| 395 |
+
"Generating response from Gemini Computer Use...", spinner_style=None
|
| 396 |
+
):
|
| 397 |
+
try:
|
| 398 |
+
response = self.get_model_response()
|
| 399 |
+
except Exception as e:
|
| 400 |
+
return "COMPLETE"
|
| 401 |
+
else:
|
| 402 |
+
try:
|
| 403 |
+
response = self.get_model_response()
|
| 404 |
+
except Exception as e:
|
| 405 |
+
return "COMPLETE"
|
| 406 |
+
|
| 407 |
+
if not response.candidates:
|
| 408 |
+
print("Response has no candidates!")
|
| 409 |
+
print(response)
|
| 410 |
+
raise ValueError("Empty response")
|
| 411 |
+
|
| 412 |
+
# Extract the text and function call from the response.
|
| 413 |
+
candidate = response.candidates[0]
|
| 414 |
+
# Append the model turn to conversation history.
|
| 415 |
+
if candidate.content:
|
| 416 |
+
self._contents.append(candidate.content)
|
| 417 |
+
|
| 418 |
+
reasoning = self.get_text(candidate)
|
| 419 |
+
function_calls = self.extract_function_calls(candidate)
|
| 420 |
+
|
| 421 |
+
# Retry the request in case of malformed FCs.
|
| 422 |
+
if (
|
| 423 |
+
not function_calls
|
| 424 |
+
and not reasoning
|
| 425 |
+
and candidate.finish_reason == FinishReason.MALFORMED_FUNCTION_CALL
|
| 426 |
+
):
|
| 427 |
+
return "CONTINUE"
|
| 428 |
+
|
| 429 |
+
if not function_calls:
|
| 430 |
+
print(f"Agent Loop Complete: {reasoning}")
|
| 431 |
+
self.final_reasoning = reasoning
|
| 432 |
+
return "COMPLETE"
|
| 433 |
+
|
| 434 |
+
function_call_strs = []
|
| 435 |
+
for function_call in function_calls:
|
| 436 |
+
# Print the function call and any reasoning.
|
| 437 |
+
function_call_str = f"Name: {function_call.name}"
|
| 438 |
+
if function_call.args:
|
| 439 |
+
function_call_str += f"\nArgs:"
|
| 440 |
+
for key, value in function_call.args.items():
|
| 441 |
+
function_call_str += f"\n {key}: {value}"
|
| 442 |
+
function_call_strs.append(function_call_str)
|
| 443 |
+
|
| 444 |
+
table = Table(expand=True)
|
| 445 |
+
table.add_column(
|
| 446 |
+
"Gemini Computer Use Reasoning", header_style="magenta", ratio=1,
|
| 447 |
+
no_wrap=False, # 允许换行
|
| 448 |
+
overflow="fold", # 超出部分折行显示
|
| 449 |
+
)
|
| 450 |
+
table.add_column("Function Call(s)", header_style="cyan", ratio=1)
|
| 451 |
+
table.add_row(reasoning, "\n".join(function_call_strs))
|
| 452 |
+
if self._verbose:
|
| 453 |
+
console.print(table)
|
| 454 |
+
print()
|
| 455 |
+
|
| 456 |
+
function_responses = []
|
| 457 |
+
for function_call in function_calls:
|
| 458 |
+
extra_fr_fields = {}
|
| 459 |
+
if function_call.args and (
|
| 460 |
+
safety := function_call.args.get("safety_decision")
|
| 461 |
+
):
|
| 462 |
+
decision = self._get_safety_confirmation(safety)
|
| 463 |
+
if decision == "TERMINATE":
|
| 464 |
+
print("Terminating agent loop")
|
| 465 |
+
return "COMPLETE"
|
| 466 |
+
# Explicitly mark the safety check as acknowledged.
|
| 467 |
+
extra_fr_fields["safety_acknowledgement"] = "true"
|
| 468 |
+
if self._verbose:
|
| 469 |
+
with console.status(
|
| 470 |
+
"Sending command to Computer...", spinner_style=None
|
| 471 |
+
):
|
| 472 |
+
fc_result = self.handle_action(function_call)
|
| 473 |
+
else:
|
| 474 |
+
fc_result = self.handle_action(function_call)
|
| 475 |
+
if isinstance(fc_result, EnvState):
|
| 476 |
+
function_responses.append(
|
| 477 |
+
FunctionResponse(
|
| 478 |
+
name=function_call.name,
|
| 479 |
+
response={
|
| 480 |
+
"url": fc_result.url,
|
| 481 |
+
**extra_fr_fields,
|
| 482 |
+
},
|
| 483 |
+
parts=[
|
| 484 |
+
types.FunctionResponsePart(
|
| 485 |
+
inline_data=types.FunctionResponseBlob(
|
| 486 |
+
mime_type="image/png", data=fc_result.screenshot
|
| 487 |
+
)
|
| 488 |
+
)
|
| 489 |
+
],
|
| 490 |
+
)
|
| 491 |
+
)
|
| 492 |
+
elif isinstance(fc_result, dict):
|
| 493 |
+
function_responses.append(
|
| 494 |
+
FunctionResponse(name=function_call.name, response=fc_result)
|
| 495 |
+
)
|
| 496 |
+
|
| 497 |
+
self._contents.append(
|
| 498 |
+
Content(
|
| 499 |
+
role="user",
|
| 500 |
+
parts=[Part(function_response=fr) for fr in function_responses],
|
| 501 |
+
)
|
| 502 |
+
)
|
| 503 |
+
|
| 504 |
+
# only keep screenshots in the few most recent turns, remove the screenshot images from the old turns.
|
| 505 |
+
turn_with_screenshots_found = 0
|
| 506 |
+
for content in reversed(self._contents):
|
| 507 |
+
if content.role == "user" and content.parts:
|
| 508 |
+
# check if content has screenshot of the predefined computer use functions.
|
| 509 |
+
has_screenshot = False
|
| 510 |
+
for part in content.parts:
|
| 511 |
+
if (
|
| 512 |
+
part.function_response
|
| 513 |
+
and part.function_response.parts
|
| 514 |
+
and part.function_response.name
|
| 515 |
+
in PREDEFINED_COMPUTER_USE_FUNCTIONS
|
| 516 |
+
):
|
| 517 |
+
has_screenshot = True
|
| 518 |
+
break
|
| 519 |
+
|
| 520 |
+
if has_screenshot:
|
| 521 |
+
turn_with_screenshots_found += 1
|
| 522 |
+
# remove the screenshot image if the number of screenshots exceed the limit.
|
| 523 |
+
if turn_with_screenshots_found > MAX_RECENT_TURN_WITH_SCREENSHOTS:
|
| 524 |
+
for part in content.parts:
|
| 525 |
+
if (
|
| 526 |
+
part.function_response
|
| 527 |
+
and part.function_response.parts
|
| 528 |
+
and part.function_response.name
|
| 529 |
+
in PREDEFINED_COMPUTER_USE_FUNCTIONS
|
| 530 |
+
):
|
| 531 |
+
part.function_response.parts = None
|
| 532 |
+
|
| 533 |
+
return "CONTINUE"
|
| 534 |
+
|
| 535 |
+
def _get_safety_confirmation(
|
| 536 |
+
self, safety: dict[str, Any]
|
| 537 |
+
) -> Literal["CONTINUE", "TERMINATE"]:
|
| 538 |
+
if safety["decision"] != "require_confirmation":
|
| 539 |
+
raise ValueError(f"Unknown safety decision: safety['decision']")
|
| 540 |
+
termcolor.cprint(
|
| 541 |
+
"Safety service requires explicit confirmation!",
|
| 542 |
+
color="yellow",
|
| 543 |
+
attrs=["bold"],
|
| 544 |
+
)
|
| 545 |
+
print(safety["explanation"])
|
| 546 |
+
decision = ""
|
| 547 |
+
while decision.lower() not in ("y", "n", "ye", "yes", "no"):
|
| 548 |
+
decision = input("Do you wish to proceed? [Yes]/[No]\n")
|
| 549 |
+
if decision.lower() in ("n", "no"):
|
| 550 |
+
return "TERMINATE"
|
| 551 |
+
return "CONTINUE"
|
| 552 |
+
|
| 553 |
+
def agent_loop(self):
|
| 554 |
+
status = "CONTINUE"
|
| 555 |
+
while status == "CONTINUE":
|
| 556 |
+
status = self.run_one_iteration()
|
| 557 |
+
|
| 558 |
+
def agent_loop_yield(self):
    """Generator form of the agent loop.

    Yields one ``(reasoning, status, function_responses_list)`` tuple per
    iteration. The last tuple yielded is the first one whose status is not
    "CONTINUE".
    """
    while True:
        outcome = self.run_one_iteration_modify()
        reasoning, status, function_responses_list = outcome
        yield reasoning, status, function_responses_list
        if status != "CONTINUE":
            return
|
| 563 |
+
|
| 564 |
+
def denormalize_x(self, x: int) -> int:
    """Scale ``x`` from a 0-1000 range to the screen width, truncating to int."""
    screen_width = self._browser_computer.screen_size()[0]
    return int(x / 1000 * screen_width)
|
| 566 |
+
|
| 567 |
+
def denormalize_y(self, y: int) -> int:
    """Scale ``y`` from a 0-1000 range to the screen height, truncating to int."""
    screen_height = self._browser_computer.screen_size()[1]
    return int(y / 1000 * screen_height)
|
app.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import time
|
| 3 |
+
import base64
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import argparse
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
from agent import BrowserAgent
|
| 9 |
+
from computers import BrowserbaseComputer, PlaywrightComputer
|
| 10 |
+
import base64
|
| 11 |
+
import sys
|
| 12 |
+
|
| 13 |
+
sys.path.append(os.path.dirname(__file__))
|
| 14 |
+
|
| 15 |
+
from prompt import prompt_options
|
| 16 |
+
|
| 17 |
+
PLAYWRIGHT_SCREEN_SIZE = (1440, 900)
|
| 18 |
+
|
| 19 |
+
os.environ["PLAYWRIGHT_HEADLESS"] = "false"
|
| 20 |
+
|
| 21 |
+
def run_genflow(query:str, prompt:str = ""):
    """Drive a browser agent against the GenFlow page and stream its steps.

    Args:
        query: The question the agent should submit to GenFlow.
        prompt: Extra evaluation instructions, inserted between the fixed
            system-prompt preamble and the fixed scrolling guidance.

    Yields:
        Whatever ``BrowserAgent.agent_loop_yield()`` yields, one item per
        agent iteration.
    """
    preamble = "GenFlow是一个AI聊天机器人。你需要作为测试员,在它的网页底部搜索栏中输入用户提交的问题, 并且观察其输出结果。"
    scrolling_guidance = """
GenFlow 返回的内容可能比较长,你可以多次执行'scroll_at'操作来查看网页中它返回的上下文。
GenFlow 有时会呈现左边主页面,右边预览区域。在这种情况下执行'scroll_at'的时候你要注意鼠标位置。
通过反复的滚动,确保主页面和预览区域已经滚动到底。预览区域往往很长,所以你需要多次执行'scroll_at'操作,确保2次滚动操作看到的网页完全一样为止, 给出最终客观评价.

GenFlow 一定会给出'输出结果',请保持足够的耐心!
**无论输入的语言是什么,你需要输出中文**"""

    computer = PlaywrightComputer(
        screen_size=PLAYWRIGHT_SCREEN_SIZE,
        initial_url="https://wenku.baidu.com/ndcore/browse/aiunion?fr=options_AIcard_1&_wkts_=1761290807747&bdQuery=genflow&t=1761290807744&tabType=genflow&aiCreat=genflow",
        # When set, the agent tries to highlight the mouse cursor position in
        # screenshots, which helps with visual debugging.
        highlight_mouse=True,
    )
    system_prompt = preamble + prompt + scrolling_guidance

    with computer as browser_computer:
        agent = BrowserAgent(
            browser_computer=browser_computer,
            query=query,
            system_prompt=system_prompt,
            model_name='gemini-2.5-computer-use-preview-10-2025',
        )
        # Each item is (reasoning, status, function_responses_list).
        for step in agent.agent_loop_yield():
            yield step
|
| 51 |
+
|
| 52 |
+
# === 处理用户查询 ===
|
| 53 |
+
def process_user_query(query, prompt=""):
    """Run the agent for ``query`` and stream progress to the UI.

    Args:
        query: The user's question; blank or missing input is rejected.
        prompt: Extra system-prompt text forwarded to ``run_genflow``.

    Yields:
        ``(steps, status_message, html)`` tuples: one after every agent
        iteration, then a final "task finished" tuple. ``steps`` is the
        accumulated list of step dicts and ``html`` is their rendering from
        ``update_steps_display``.
    """
    if not query or not query.strip():
        yield [], "请输入有效的查询", "<p>无执行步骤</p>"
        return

    steps = []

    for reasoning, status, function_responses_list in run_genflow(query, prompt):
        # Each entry is expected to be a mapping with exactly the keys
        # "screenshot", "action" and "response"; anything else is shown as an
        # unknown action.
        sub_steps = []
        for item in function_responses_list or []:
            if len(item) == 3:
                screenshot_bytes = item["screenshot"]
                action = item["action"]
                response = item["response"]
            else:
                screenshot_bytes, action, response = "", "未知操作", ""

            # Build the data URI outside the f-string: reusing double quotes
            # inside a double-quoted f-string is a SyntaxError before
            # Python 3.12.
            if screenshot_bytes:
                encoded = base64.b64encode(screenshot_bytes).decode("utf-8")
                screenshot_uri = f"data:image/png;base64,{encoded}"
            else:
                screenshot_uri = ""

            sub_steps.append({
                "screenshot": screenshot_uri,
                "action": action,
                "response": response,
            })

        step = {
            "step": len(steps) + 1,
            "reasoning": reasoning,
            "status": status,
            "functions": sub_steps,  # one step may carry several function results
        }
        steps.append(step)
        display_html = update_steps_display(steps)
        yield steps, f"正在执行第 {len(steps)} 步: {status}", display_html
        time.sleep(0.3)

    yield steps, f"任务完成!共执行 {len(steps)} 步。", update_steps_display(steps)
|
| 87 |
+
|
| 88 |
+
# === 更新步骤展示的 HTML ===
|
| 89 |
+
def update_steps_display(steps):
    """Render the executed steps (reasoning, function calls, screenshots) as HTML.

    Args:
        steps: List of step dicts with the keys "step", "reasoning", "status"
            and "functions"; every function entry has "action", "response"
            and "screenshot" (a data URI, or an empty string for no image).

    Returns:
        An HTML fragment, or a placeholder paragraph when ``steps`` is empty.
    """
    # Imported locally under a different name so the block does not depend on
    # a module-level import.
    from html import escape

    if not steps:
        return "<p>暂无执行步骤</p>"

    status_colors = {
        "COMPLETE": "green",
        "CONTINUE": "orange",
        "FAILURE": "red",
    }

    # Collect fragments and join once instead of growing one string with +=.
    parts = ["<div style='font-family: Arial, sans-serif;'>"]
    for step in steps:
        status_color = status_colors.get(step["status"], "black")
        # Reasoning, actions and responses come from the model and from
        # visited pages, so they are escaped before being placed in markup.
        status_text = escape(str(step["status"]))
        reasoning_text = escape(str(step["reasoning"]))

        parts.append(f"""
    <div style='border:1px solid #ddd; margin:10px 0; padding:15px; border-radius:8px;'>
        <div style='display:flex; justify-content:space-between; align-items:center;'>
            <h3 style='margin:0;'>步骤 {step['step']}</h3>
            <span style='color:{status_color}; font-weight:bold;'>{status_text}</span>
        </div>
        <p><strong>推理:</strong> {reasoning_text}</p>
    """)

        # A step may contain several function calls; show each in its own box.
        for idx, func in enumerate(step["functions"], start=1):
            action_text = escape(str(func["action"]))
            response_text = escape(str(func["response"]))
            parts.append(f"""
        <div style="word-wrap: break-word; word-break: break-all; white-space: pre-wrap; border:1px solid #ddd; border-radius:6px; padding:8px; margin-top:6px;">
            <p><strong>函数调用 {idx}:</strong></p>
            <p>动作: {action_text}</p>
            <p>返回: {response_text}</p>
        """)
            if func["screenshot"]:
                screenshot_src = escape(str(func["screenshot"]))
                parts.append(
                    f"<img src='{screenshot_src}' style='max-width:100%; border:1px solid #ccc; border-radius:6px;'/>"
                )
            parts.append("</div>")

        parts.append("</div>")  # end of this step's box
    parts.append("</div>")
    return "".join(parts)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def resolve_prompt(selected_value):
    """Map a dropdown selection to prompt text.

    Returns the ``prompt_options`` entry when ``selected_value`` is one of its
    keys; otherwise returns ``selected_value`` itself, or an empty string when
    it is falsy.
    """
    if selected_value not in prompt_options:
        return selected_value or ""
    return prompt_options[selected_value]
|
| 135 |
+
|
| 136 |
+
# === 创建 Gradio 界面 ===
|
| 137 |
+
def create_demo_interface():
    """Build the Gradio Blocks UI and wire its events; returns the Blocks app."""
    header_html = """
<div style="font-family: 'Microsoft YaHei', sans-serif; font-size: 18px; line-height: 1.6;">
<h1 style="color: #2E86AB; font-size: 28px; font-weight: bold; text-align: center;">🧭 CUA 在线试用系统</h1>
<p style="color: #555; font-size: 16px; text-align: center;">
输入一个任务描述,Agent 将自动进行 genflow 中浏览、截图、分析并执行下一步操作。
</p>
</div>
"""

    with gr.Blocks(title="CUA 在线试用系统", theme=gr.themes.Soft()) as demo:
        gr.Markdown(header_html)

        with gr.Row():
            with gr.Column(scale=1):
                query_input = gr.Textbox(
                    label="需要在 genflow 中测试的问题",
                    placeholder="例如:帮我搜索AI新闻 / 购买一台笔记本电脑",
                    lines=2,
                )

                prompt_dropdown = gr.Dropdown(
                    label="选择或在右侧系统提示词中输入评估标准",
                    choices=list(prompt_options),
                    value=None,
                    # Only predefined keys can be picked here; free-form text
                    # goes into the system-prompt textbox instead.
                    allow_custom_value=False,
                )
                submit_btn = gr.Button("开始执行", variant="primary")

            with gr.Column(scale=2):
                status_output = gr.Textbox(label="执行状态", interactive=False)
                final_prompt = gr.Textbox(label="系统提示词", interactive=True, lines=5)
                # Selecting a preset fills the system-prompt textbox.
                prompt_dropdown.change(
                    resolve_prompt,
                    inputs=prompt_dropdown,
                    outputs=final_prompt,
                )

        # The rendered step-by-step view is placed before the raw JSON.
        with gr.Accordion("详细执行过程", open=True):
            steps_display = gr.HTML()

        # Raw step data as JSON.
        steps_output = gr.JSON(label="执行步骤详情(JSON)")

        # process_user_query is a generator, so outputs update as it yields.
        submit_btn.click(
            fn=process_user_query,
            inputs=[query_input, final_prompt],
            outputs=[steps_output, status_output, steps_display],
            api_name="run_agent",
            show_progress=True,
            queue=True,
        )

    return demo
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
if __name__ == "__main__":
    # Build the UI and enable the queue before launching; the original author
    # notes the queue is required for generator (yield) handlers.
    demo = create_demo_interface()
    demo.queue()
    # Listen on all interfaces on port 7860 and request a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo.launch(**launch_options)
|
cache/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
cache/user_data/state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
computers/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
from .computer import Computer, EnvState
|
| 15 |
+
from .browserbase.browserbase import BrowserbaseComputer
|
| 16 |
+
from .playwright.playwright import PlaywrightComputer
|
| 17 |
+
|
| 18 |
+
__all__ = [
|
| 19 |
+
"Computer",
|
| 20 |
+
"EnvState",
|
| 21 |
+
"BrowserbaseComputer",
|
| 22 |
+
"PlaywrightComputer",
|
| 23 |
+
]
|
computers/browserbase/__init__.py
ADDED
|
File without changes
|
computers/browserbase/browserbase.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import os
|
| 15 |
+
import termcolor
|
| 16 |
+
from ..playwright.playwright import PlaywrightComputer
|
| 17 |
+
import browserbase
|
| 18 |
+
from playwright.sync_api import sync_playwright
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class BrowserbaseComputer(PlaywrightComputer):
    """PlaywrightComputer variant that drives a remote Browserbase session.

    Reads ``BROWSERBASE_API_KEY`` and ``BROWSERBASE_PROJECT_ID`` from the
    environment when the context is entered.
    """

    def __init__(
        self,
        screen_size: tuple[int, int],
        initial_url: str = "https://www.google.com",
    ):
        """Store the viewport size and the URL to open on entry."""
        super().__init__(screen_size, initial_url)

    def __enter__(self):
        """Create a Browserbase session, attach to it over CDP and open the initial URL.

        Raises:
            KeyError: If a required environment variable is missing.
        """
        print("Creating session...")

        self._playwright = sync_playwright().start()
        try:
            self._browserbase = browserbase.Browserbase(
                api_key=os.environ["BROWSERBASE_API_KEY"]
            )

            self._session = self._browserbase.sessions.create(
                project_id=os.environ["BROWSERBASE_PROJECT_ID"],
                browser_settings={
                    "fingerprint": {
                        "screen": {
                            "maxWidth": 1920,
                            "maxHeight": 1080,
                            "minWidth": 1024,
                            "minHeight": 768,
                        },
                    },
                    "viewport": {
                        "width": self._screen_size[0],
                        "height": self._screen_size[1],
                    },
                },
            )

            self._browser = self._playwright.chromium.connect_over_cdp(
                self._session.connect_url
            )
            # Reuse the first context and page exposed by the remote browser.
            self._context = self._browser.contexts[0]
            self._page = self._context.pages[0]
            self._page.goto(self._initial_url)

            # Pages opened later are redirected into the single tracked page.
            self._context.on("page", self._handle_new_page)
        except BaseException:
            # __exit__ is not called when __enter__ raises, so stop the
            # Playwright driver here to avoid leaking it.
            self._playwright.stop()
            raise

        termcolor.cprint(
            f"Session started at https://browserbase.com/sessions/{self._session.id}",
            color="green",
            attrs=["bold"],
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the page, context and browser, then always stop Playwright."""
        try:
            self._page.close()

            if self._context:
                self._context.close()

            if self._browser:
                self._browser.close()
        finally:
            # Runs even when one of the close calls above raises.
            self._playwright.stop()
|
computers/computer.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import abc
|
| 15 |
+
import pydantic
|
| 16 |
+
from typing import Literal
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class EnvState(pydantic.BaseModel):
    """State of the environment, as returned by the ``Computer`` methods."""

    # The screenshot in PNG format.
    screenshot: bytes
    # NOTE(review): presumably the URL of the page the screenshot was taken
    # from; confirm against the implementations.
    url: str
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class Computer(abc.ABC):
    """Defines an interface for environments.

    Every method is abstract. All of them except ``screen_size`` return an
    ``EnvState``.
    """

    @abc.abstractmethod
    def screen_size(self) -> tuple[int, int]:
        """Returns the screen size of the environment."""

    @abc.abstractmethod
    def open_web_browser(self) -> EnvState:
        """Opens the web browser."""

    @abc.abstractmethod
    def click_at(self, x: int, y: int) -> EnvState:
        """Clicks at a specific x, y coordinate on the webpage.

        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def hover_at(self, x: int, y: int) -> EnvState:
        """Hovers at a specific x, y coordinate on the webpage.

        May be used to explore sub-menus that appear on hover.
        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def type_text_at(
        self,
        x: int,
        y: int,
        text: str,
        press_enter: bool,
        clear_before_typing: bool,
    ) -> EnvState:
        """Types text at a specific x, y coordinate.

        The system automatically presses ENTER after typing. To disable this, set `press_enter` to False.
        The system automatically clears any existing content before typing the specified `text`. To disable this, set `clear_before_typing` to False.
        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def scroll_document(
        self, direction: Literal["up", "down", "left", "right"]
    ) -> EnvState:
        """Scrolls the entire webpage "up", "down", "left" or "right" based on direction."""

    @abc.abstractmethod
    def scroll_at(
        self,
        x: int,
        y: int,
        direction: Literal["up", "down", "left", "right"],
        magnitude: int,
    ) -> EnvState:
        """Scrolls up, down, right, or left at a x, y coordinate by magnitude.

        The 'x' and 'y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def wait_5_seconds(self) -> EnvState:
        """Waits for 5 seconds to allow unfinished webpage processes to complete."""

    @abc.abstractmethod
    def go_back(self) -> EnvState:
        """Navigates back to the previous webpage in the browser history."""

    @abc.abstractmethod
    def go_forward(self) -> EnvState:
        """Navigates forward to the next webpage in the browser history."""

    @abc.abstractmethod
    def search(self) -> EnvState:
        """Directly jumps to a search engine home page.

        Used when you need to start with a search. For example, this is used when
        the current website doesn't have the information needed or because a new
        task is being started.
        """

    @abc.abstractmethod
    def navigate(self, url: str) -> EnvState:
        """Navigates directly to a specified URL."""

    @abc.abstractmethod
    def key_combination(self, keys: list[str]) -> EnvState:
        """Presses keyboard keys and combinations, such as "control+c" or "enter"."""

    @abc.abstractmethod
    def drag_and_drop(
        self, x: int, y: int, destination_x: int, destination_y: int
    ) -> EnvState:
        """Drags an element from the (x, y) coordinate and drops it at (destination_x, destination_y).

        The 'x', 'y', 'destination_x' and 'destination_y' values are absolute values, scaled to the height and width of the screen.
        """

    @abc.abstractmethod
    def current_state(self) -> EnvState:
        """Returns the current state of the current webpage."""
|
computers/playwright/__init__.py
ADDED
|
File without changes
|
computers/playwright/playwright.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import logging
|
| 15 |
+
import termcolor
|
| 16 |
+
import time
|
| 17 |
+
import os
|
| 18 |
+
import sys
|
| 19 |
+
from ..computer import (
|
| 20 |
+
Computer,
|
| 21 |
+
EnvState,
|
| 22 |
+
)
|
| 23 |
+
import playwright.sync_api
|
| 24 |
+
from playwright.sync_api import sync_playwright
|
| 25 |
+
from typing import Literal
|
| 26 |
+
import glob
|
| 27 |
+
|
| 28 |
+
# Define a mapping from the user-friendly key names to Playwright's expected key names.
|
| 29 |
+
# Playwright is generally good with case-insensitivity for these, but it's best to be canonical.
|
| 30 |
+
# See: https://playwright.dev/docs/api/class-keyboard#keyboard-press
|
| 31 |
+
# Keys like 'a', 'b', '1', '$' are passed directly.
|
| 32 |
+
PLAYWRIGHT_KEY_MAP = {
|
| 33 |
+
"backspace": "Backspace",
|
| 34 |
+
"tab": "Tab",
|
| 35 |
+
"return": "Enter", # Playwright uses 'Enter'
|
| 36 |
+
"enter": "Enter",
|
| 37 |
+
"shift": "Shift",
|
| 38 |
+
"control": "ControlOrMeta",
|
| 39 |
+
"alt": "Alt",
|
| 40 |
+
"escape": "Escape",
|
| 41 |
+
"space": "Space", # Can also just be " "
|
| 42 |
+
"pageup": "PageUp",
|
| 43 |
+
"pagedown": "PageDown",
|
| 44 |
+
"end": "End",
|
| 45 |
+
"home": "Home",
|
| 46 |
+
"left": "ArrowLeft",
|
| 47 |
+
"up": "ArrowUp",
|
| 48 |
+
"right": "ArrowRight",
|
| 49 |
+
"down": "ArrowDown",
|
| 50 |
+
"insert": "Insert",
|
| 51 |
+
"delete": "Delete",
|
| 52 |
+
"semicolon": ";", # For actual character ';'
|
| 53 |
+
"equals": "=", # For actual character '='
|
| 54 |
+
"multiply": "Multiply", # NumpadMultiply
|
| 55 |
+
"add": "Add", # NumpadAdd
|
| 56 |
+
"separator": "Separator", # Numpad specific
|
| 57 |
+
"subtract": "Subtract", # NumpadSubtract, or just '-' for character
|
| 58 |
+
"decimal": "Decimal", # NumpadDecimal, or just '.' for character
|
| 59 |
+
"divide": "Divide", # NumpadDivide, or just '/' for character
|
| 60 |
+
"f1": "F1",
|
| 61 |
+
"f2": "F2",
|
| 62 |
+
"f3": "F3",
|
| 63 |
+
"f4": "F4",
|
| 64 |
+
"f5": "F5",
|
| 65 |
+
"f6": "F6",
|
| 66 |
+
"f7": "F7",
|
| 67 |
+
"f8": "F8",
|
| 68 |
+
"f9": "F9",
|
| 69 |
+
"f10": "F10",
|
| 70 |
+
"f11": "F11",
|
| 71 |
+
"f12": "F12",
|
| 72 |
+
"command": "Meta", # 'Meta' is Command on macOS, Windows key on Windows
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class PlaywrightComputer(Computer):
|
| 77 |
+
"""Connects to a local Playwright instance."""
|
| 78 |
+
|
| 79 |
+
def __init__(
    self,
    screen_size: tuple[int, int],
    initial_url: str = "https://www.google.com",
    search_engine_url: str = "https://www.google.com",
    highlight_mouse: bool = False,
):
    """Store the configuration; no browser is started here.

    Args:
        screen_size: Viewport size; ``__enter__`` reads index 0 as the width
            and index 1 as the height.
        initial_url: Page opened by ``__enter__``.
        search_engine_url: Stored as ``_search_engine_url``.
            NOTE(review): presumably the target of ``search()``; confirm.
        highlight_mouse: Stored as ``_highlight_mouse``.
            NOTE(review): presumably toggles cursor highlighting; confirm.
    """
    self._initial_url = initial_url
    self._screen_size = screen_size
    self._search_engine_url = search_engine_url
    self._highlight_mouse = highlight_mouse
|
| 90 |
+
|
| 91 |
+
def _handle_new_page(self, new_page: playwright.sync_api.Page):
    """Fold a newly opened tab back into the single tracked page.

    The Computer Use model only supports one tab at the moment, yet some
    websites open links in a new tab. When that happens, the new tab is
    closed and the current page is pointed at the URL it was going to show.
    """
    target_url = new_page.url
    new_page.close()
    self._page.goto(target_url)
|
| 100 |
+
|
| 101 |
+
# del
|
| 102 |
+
def cleanup_old_screenshots(self, logs_dir="logs", max_count=30):
    """Delete old screenshot files, keeping only the newest ``max_count``.

    Only files matching ``screenshot_*.png`` directly inside ``logs_dir`` are
    considered, ordered by modification time. Nothing happens when the
    directory is missing or holds no more than ``max_count`` matches.

    Raises:
        ValueError: If a file selected for deletion cannot be removed.
    """
    # Nothing to prune when the log directory has not been created yet.
    if not os.path.exists(logs_dir):
        return

    screenshots = glob.glob(os.path.join(logs_dir, "screenshot_*.png"))
    if len(screenshots) <= max_count:
        return

    # Newest first, so everything past the first max_count entries is stale.
    newest_first = sorted(screenshots, key=os.path.getmtime, reverse=True)
    for file_path in newest_first[max_count:]:
        try:
            os.remove(file_path)
        except Exception as e:
            raise ValueError(f"Failed to delete screenshots {file_path}: {e}")
|
| 128 |
+
|
| 129 |
+
def __enter__(self):
    """Launch local Chromium, open the initial URL and return this computer.

    Headless mode is controlled by the ``PLAYWRIGHT_HEADLESS`` environment
    variable: unset, empty, "0", "false", "no" and "off" (any case) run with a
    visible window; any other value runs headless. Saved login state in
    ``cache/user_data/state.json`` is reused when that file exists.

    Raises:
        ValueError: If inspecting or collapsing the GenFlow history sidebar
            fails; the underlying error is attached as the cause.
    """
    print("Creating session...")
    self._playwright = sync_playwright().start()
    try:
        # Environment values are strings, so bool("false") would be True and
        # the browser would always be headless; parse the value explicitly.
        headless_env = os.environ.get("PLAYWRIGHT_HEADLESS", "")
        headless = headless_env.strip().lower() not in ("", "0", "false", "no", "off")

        self._browser = self._playwright.chromium.launch(
            args=[
                "--disable-extensions",
                "--disable-file-system",
                "--disable-plugins",
                "--disable-dev-shm-usage",
                "--disable-background-networking",
                "--disable-default-apps",
                "--disable-sync",
                # No '--no-sandbox' arg means the sandbox is on.
            ],
            headless=headless,
        )
        self.cleanup_old_screenshots()

        # Directory holding the persisted user/login state.
        os.makedirs("cache/user_data", exist_ok=True)
        storage_state = (
            "cache/user_data/state.json"
            if os.path.exists("cache/user_data/state.json")
            else None
        )
        self._context = self._browser.new_context(
            viewport={
                "width": self._screen_size[0],
                "height": self._screen_size[1],
            },
            storage_state=storage_state,
        )

        self._page = self._context.new_page()
        self._page.goto(self._initial_url, wait_until="domcontentloaded")

        # Only one tab is supported at a time: when the context opens a new
        # tab, the listener closes it and navigates the current page to the
        # URL the new tab was heading to.
        self._context.on("page", self._handle_new_page)

        # Collapse the left-hand chat-history sidebar so the page matches the
        # GenFlow UI described in the prompt. Pages without that sidebar just
        # log a message.
        try:
            chat_nav = self._page.locator('div[data-v-74198486].chat-nav')
            is_expand = None
            if chat_nav.count() > 0:
                # get_attribute may return None when the attribute is absent.
                class_name = chat_nav.get_attribute("class") or ""
                is_expand = "show" in class_name
                print(class_name)
            else:
                print("未找到左侧历史消息扩展栏元素")
            if is_expand:
                expand_icon = self._page.locator('.expand-icon')
                if expand_icon.count() > 0:
                    expand_icon.click()
                else:
                    print("未找到左侧历史消息扩展栏点击元素")
        except Exception as e:
            raise ValueError("未找到左侧历史消息扩展栏元素") from e
    except BaseException:
        # __exit__ is not called when __enter__ raises, so stop the
        # Playwright driver here to avoid leaking it.
        self._playwright.stop()
        raise

    termcolor.cprint(
        "Started local playwright.",
        color="green",
        attrs=["bold"],
    )
    return self
|
| 206 |
+
|
| 207 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
    """Persist login state, then close the context, browser and Playwright.

    Each later shutdown step runs even if an earlier one raises, so a failure
    while saving state no longer leaves the browser or driver running. The
    first error still propagates to the caller.
    """
    try:
        try:
            if self._context:
                try:
                    # Save cookies/storage so the next run can skip login.
                    self._context.storage_state(path="cache/user_data/state.json")
                finally:
                    self._context.close()
        finally:
            try:
                self._browser.close()
            except Exception as e:
                # Browser was already shut down because of SIGINT or such;
                # anything else is a real error.
                if (
                    "Browser.close: Connection closed while reading from the driver"
                    not in str(e)
                ):
                    raise
    finally:
        self._playwright.stop()
|
| 223 |
+
|
| 224 |
+
def open_web_browser(self) -> EnvState:
    """Return the current state without performing any browser action."""
    return self.current_state()
|
| 226 |
+
|
| 227 |
+
def click_at(self, x: int, y: int) -> EnvState:
    """Click at page coordinate (x, y), wait for the load state, return the new state."""
    # NOTE(review): highlight_mouse is defined outside this view; presumably
    # it marks the target point for debugging. Confirm.
    self.highlight_mouse(x, y)
    self._page.mouse.click(x, y)
    self._page.wait_for_load_state()
    return self.current_state()
|
| 232 |
+
|
| 233 |
+
def hover_at(self, x: int, y: int) -> EnvState:
    """Move the mouse to (x, y) without clicking and return the resulting state.

    Args:
        x: Horizontal coordinate passed to Playwright's mouse.
        y: Vertical coordinate passed to Playwright's mouse.

    Returns:
        The state captured by ``current_state()`` after the page settled.
    """
    self.highlight_mouse(x, y)
    self._page.mouse.move(x, y)
    self._page.wait_for_load_state()
    return self.current_state()
|
| 238 |
+
|
| 239 |
+
def type_text_at(
    self,
    x: int,
    y: int,
    text: str,
    press_enter: bool = False,
    clear_before_typing: bool = True,
) -> EnvState:
    """Click at (x, y), optionally clear the focused field, then type ``text``.

    Args:
        x: Horizontal coordinate of the click that focuses the target.
        y: Vertical coordinate of the click that focuses the target.
        text: Text typed through the Playwright keyboard.
        press_enter: When true, press Enter after typing.
        clear_before_typing: When true, select everything and delete it
            before typing.

    Returns:
        The state captured by ``current_state()`` at the end.
    """
    self.highlight_mouse(x, y)
    self._page.mouse.click(x, y)
    self._page.wait_for_load_state()

    if clear_before_typing:
        # "Select all" uses Command on macOS and Control everywhere else.
        select_all_modifier = "Command" if sys.platform == "darwin" else "Control"
        self.key_combination([select_all_modifier, "A"])
        self.key_combination(["Delete"])

    self._page.keyboard.type(text)
    self._page.wait_for_load_state()

    if press_enter:
        self.key_combination(["Enter"])
    self._page.wait_for_load_state()
    return self.current_state()
|
| 265 |
+
|
| 266 |
+
def _horizontal_document_scroll(
    self, direction: Literal["left", "right"]
) -> EnvState:
    """Scroll the document sideways by half of the screen width via JavaScript.

    Args:
        direction: ``"left"`` scrolls by a negative amount; any other value
            scrolls by a positive amount.

    Returns:
        The state captured by ``current_state()`` after scrolling.
    """
    # One step is 50% of the width reported by screen_size().
    half_width = self.screen_size()[0] // 2
    prefix = "-" if direction == "left" else ""
    self._page.evaluate(f"window.scrollBy({prefix}{half_width}, 0); ")
    self._page.wait_for_load_state()
    return self.current_state()
|
| 280 |
+
|
| 281 |
+
def scroll_document(
    self, direction: Literal["up", "down", "left", "right"]
) -> EnvState:
    """Scroll the whole document one step in the given direction.

    Vertical scrolling presses PageDown / PageUp; horizontal scrolling is
    delegated to ``_horizontal_document_scroll``.

    Args:
        direction: One of ``"up"``, ``"down"``, ``"left"`` or ``"right"``.

    Returns:
        The state returned by the delegated action.

    Raises:
        ValueError: If ``direction`` is not one of the supported values.
    """
    if direction == "down":
        return self.key_combination(["PageDown"])
    if direction == "up":
        return self.key_combination(["PageUp"])
    if direction in ("left", "right"):
        return self._horizontal_document_scroll(direction)
    # A single formatted message; passing two args would render as a tuple.
    raise ValueError(f"Unsupported direction: {direction!r}")
|
| 292 |
+
|
| 293 |
+
def scroll_at(
    self,
    x: int,
    y: int,
    direction: Literal["up", "down", "left", "right"],
    magnitude: int = 800,
) -> EnvState:
    """Move the mouse to (x, y) and turn the wheel in the given direction.

    Args:
        x: Horizontal coordinate the mouse is moved to before scrolling.
        y: Vertical coordinate the mouse is moved to before scrolling.
        direction: One of ``"up"``, ``"down"``, ``"left"`` or ``"right"``.
        magnitude: Wheel delta applied along the chosen axis.

    Returns:
        The state captured by ``current_state()`` after scrolling.

    Raises:
        ValueError: If ``direction`` is not supported. Raised before the
            mouse is touched, so an invalid call has no side effects.
    """
    wheel_deltas = {
        "up": (0, -magnitude),
        "down": (0, magnitude),
        "left": (-magnitude, 0),
        "right": (magnitude, 0),
    }
    if direction not in wheel_deltas:
        raise ValueError(f"Unsupported direction: {direction!r}")
    dx, dy = wheel_deltas[direction]

    self.highlight_mouse(x, y)

    self._page.mouse.move(x, y)
    self._page.wait_for_load_state()

    self._page.mouse.wheel(dx, dy)
    self._page.wait_for_load_state()
    return self.current_state()
|
| 321 |
+
|
| 322 |
+
def wait_5_seconds(self, seconds: float = 20) -> EnvState:
    """Pause, then return the current state.

    NOTE(review): despite the name, the pause defaults to 20 seconds; the
    original code hard-coded that value next to a "del" marker, so it looks
    like a temporary override. The default is kept so existing behaviour is
    unchanged; confirm whether it should go back to 5.

    Args:
        seconds: How long to sleep before capturing the state.

    Returns:
        The state captured by ``current_state()`` after the pause.
    """
    time.sleep(seconds)
    return self.current_state()
|
| 326 |
+
|
| 327 |
+
def go_back(self) -> EnvState:
    """Go one step back in the page history and return the resulting state."""
    self._page.go_back()
    self._page.wait_for_load_state()
    state_after = self.current_state()
    return state_after
|
| 331 |
+
|
| 332 |
+
def go_forward(self) -> EnvState:
    """Go one step forward in the page history and return the resulting state."""
    self._page.go_forward()
    self._page.wait_for_load_state()
    state_after = self.current_state()
    return state_after
|
| 336 |
+
|
| 337 |
+
def search(self) -> EnvState:
    """Navigate to the configured search engine URL and return the state."""
    target = self._search_engine_url
    return self.navigate(target)
|
| 339 |
+
|
| 340 |
+
def navigate(self, url: str) -> EnvState:
    """Load ``url`` in the page, prefixing ``https://`` when no scheme is given.

    Args:
        url: Target address. Anything not starting with ``http://`` or
            ``https://`` gets ``https://`` prepended.

    Returns:
        The state captured by ``current_state()`` after loading.
    """
    has_scheme = url.startswith(("http://", "https://"))
    target = url if has_scheme else "https://" + url
    self._page.goto(target)
    self._page.wait_for_load_state()
    return self.current_state()
|
| 347 |
+
|
| 348 |
+
def key_combination(self, keys: list[str]) -> EnvState:
    """Press a key chord: hold all keys but the last, press the last, release.

    Args:
        keys: Key names. Each is looked up (lower-cased) in
            ``PLAYWRIGHT_KEY_MAP`` and falls back to the name as given.

    Returns:
        The state captured by ``current_state()`` after the chord.

    Raises:
        ValueError: If ``keys`` is empty.
    """
    if not keys:
        raise ValueError("key_combination requires at least one key")

    # Normalize all keys to the Playwright compatible version.
    keys = [PLAYWRIGHT_KEY_MAP.get(k.lower(), k) for k in keys]
    *modifiers, final_key = keys

    held = []
    try:
        for key in modifiers:
            self._page.keyboard.down(key)
            held.append(key)
        self._page.keyboard.press(final_key)
    finally:
        # Release in reverse order even when a press failed, so no modifier
        # stays stuck down for the following actions.
        for key in reversed(held):
            self._page.keyboard.up(key)

    return self.current_state()
|
| 361 |
+
|
| 362 |
+
def drag_and_drop(
    self, x: int, y: int, destination_x: int, destination_y: int
) -> EnvState:
    """Drag with the mouse from (x, y) to (destination_x, destination_y).

    Args:
        x: Horizontal coordinate where the button is pressed.
        y: Vertical coordinate where the button is pressed.
        destination_x: Horizontal coordinate where the button is released.
        destination_y: Vertical coordinate where the button is released.

    Returns:
        The state captured by ``current_state()`` after the drop.
    """

    def glide_to(px: int, py: int) -> None:
        # Highlight, move and let the page settle; shared by both endpoints.
        self.highlight_mouse(px, py)
        self._page.mouse.move(px, py)
        self._page.wait_for_load_state()

    glide_to(x, y)
    self._page.mouse.down()
    self._page.wait_for_load_state()

    glide_to(destination_x, destination_y)
    self._page.mouse.up()
    return self.current_state()
|
| 376 |
+
|
| 377 |
+
def current_state(self) -> EnvState:
    """Capture a PNG screenshot of the viewport together with the page URL.

    A copy of each screenshot is also written to ``logs/`` as a debugging
    aid. Writing that copy is best effort: a missing or unwritable directory
    is reported but does not abort the run.

    Returns:
        An ``EnvState`` holding the screenshot bytes and the current URL.
    """
    import os
    from datetime import datetime

    self._page.wait_for_load_state()
    # Even if Playwright reports the page as loaded, it may not be so.
    # Add a manual sleep to make sure the page has finished rendering.
    time.sleep(0.5)
    screenshot_bytes = self._page.screenshot(type="png", full_page=False)

    # Debug aid: keep a copy of the screenshot on disk.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"logs/screenshot_{timestamp}.png"
    try:
        os.makedirs("logs", exist_ok=True)
        with open(filename, "wb") as f:
            f.write(screenshot_bytes)
    except OSError as e:
        print(f"Could not save debug screenshot {filename}: {e}")

    return EnvState(screenshot=screenshot_bytes, url=self._page.url)
|
| 392 |
+
|
| 393 |
+
def screen_size(self) -> tuple[int, int]:
    """Return (width, height), preferring the live page viewport.

    Falls back to the size stored in ``self._screen_size`` when the page
    reports no viewport size.
    """
    viewport = self._page.viewport_size
    if not viewport:
        # No viewport reported: use the originally provided size.
        return self._screen_size
    return viewport["width"], viewport["height"]
|
| 400 |
+
|
| 401 |
+
def highlight_mouse(self, x: int, y: int):
    """Briefly draw a red circle centred on (x, y) in the page.

    Does nothing unless ``self._highlight_mouse`` is truthy. The marker is
    removed from the DOM after two seconds, so repeated calls do not pile up
    hidden elements. The coordinates are passed to the page function as an
    argument instead of being formatted into the script text.

    Args:
        x: Horizontal coordinate of the circle centre.
        y: Vertical coordinate of the circle centre.
    """
    if not self._highlight_mouse:
        return
    self._page.evaluate(
        """
    ([x, y]) => {
        const element_id = "playwright-feedback-circle";
        const div = document.createElement('div');
        div.id = element_id;
        div.style.pointerEvents = 'none';
        div.style.border = '4px solid red';
        div.style.borderRadius = '50%';
        div.style.width = '20px';
        div.style.height = '20px';
        div.style.position = 'fixed';
        div.style.zIndex = '9999';
        document.body.appendChild(div);

        div.hidden = false;
        div.style.left = (x - 10) + 'px';
        div.style.top = (y - 10) + 'px';

        setTimeout(() => {
            div.remove();
        }, 2000);
    }
    """,
        [x, y],
    )
    # Wait a bit for the user to see the cursor.
    time.sleep(1)
|
genflow.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2025 Google LLC
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
import argparse
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
from agent import BrowserAgent
|
| 18 |
+
from computers import BrowserbaseComputer, PlaywrightComputer
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Size passed as `screen_size` to the computer environments below;
# presumably (width, height) - confirm against the computer classes.
PLAYWRIGHT_SCREEN_SIZE = (1440, 900)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def main() -> int:
    """Parse the command line and run the browser agent once.

    Builds the computer environment selected with ``--env``, then runs
    ``BrowserAgent.agent_loop()`` for the given ``--query``.

    Returns:
        ``0`` after the agent loop finished.

    Raises:
        ValueError: If the environment name is not recognised (argparse's
            ``choices`` normally rejects such values first).
    """
    parser = argparse.ArgumentParser(description="Run the browser agent with a query.")
    parser.add_argument(
        "--query",
        type=str,
        required=True,
        help="The query for the browser agent to execute.",
    )

    parser.add_argument(
        "--env",
        type=str,
        choices=("playwright", "browserbase"),
        default="playwright",
        help="The computer use environment to use.",
    )
    parser.add_argument(
        "--initial_url",
        type=str,
        default="https://www.google.com",
        help="The initial URL loaded for the computer.",
    )
    parser.add_argument(
        "--highlight_mouse",
        action="store_true",
        default=False,
        help="If possible, highlight the location of the mouse.",
    )
    parser.add_argument(
        "--model",
        default='gemini-2.5-computer-use-preview-10-2025',
        help="Set which main model to use.",
    )
    args = parser.parse_args()

    if args.env == "playwright":
        env = PlaywrightComputer(
            screen_size=PLAYWRIGHT_SCREEN_SIZE,
            initial_url=args.initial_url,
            highlight_mouse=args.highlight_mouse,
        )
    elif args.env == "browserbase":
        env = BrowserbaseComputer(
            screen_size=PLAYWRIGHT_SCREEN_SIZE,
            initial_url=args.initial_url,
        )
    else:
        raise ValueError(f"Unknown environment: {args.env!r}")

    with env as browser_computer:
        agent = BrowserAgent(
            browser_computer=browser_computer,
            query=args.query,
            model_name=args.model,
        )
        agent.agent_loop()
    return 0
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def run_genflow():
    """Run a hard-coded evaluation scenario in a local Playwright browser.

    Three scenarios are defined one after another and each overwrites
    ``query`` / ``criterion`` / ``prompt``; only the LAST one (GenFlow vs.
    DouBao arithmetic comparison) is actually sent to the agent. The earlier
    ones are kept so they can be re-enabled by reordering.
    """
    # Single source of truth for the GenFlow page; used for the initial page
    # and inside the comparison prompt.
    genflow_url = "https://wenku.baidu.com/ndcore/browse/aiunion?fr=options_AIcard_1&_wkts_=1761290807747&bdQuery=genflow&t=1761290807744&tabType=genflow&aiCreat=genflow"
    env = PlaywrightComputer(
        screen_size=PLAYWRIGHT_SCREEN_SIZE,
        initial_url=genflow_url,
        # initial_url="https://www.doubao.com/chat/",
        # When set, the agent tries to highlight the mouse cursor position in
        # the screenshots; useful for visual debugging.
        highlight_mouse=True,
    )
    # query="在提示文本为'输入问题,交给GenFlow的'搜索条中输入‘百度’,点击发送按钮",

    # Sample questions; currently not used by any scenario below.
    search_query = [
        "《劳动法》中关于加班与调休的相关规定解读",
        "不到喀什游不算到新疆,不到古城游不算到喀什,最后一天来一场深度的早到晚古城游!",
        "为何说地球正处在5大灾难性生态临界点?",
        "为什么要进行展示设计案例分析",
        "镜子为什么不能正对床?",
        "鹅绒羽绒服比鸭绒羽绒服好在哪?",
        "如何从根本解决宿舍问题,改善职工住宿条件?",
        "独生子女为什么不能全部继承父母遗产?",
        "写一篇忧郁文艺小诗 一节六句话",
        "9.3阅兵有哪些国家参加",
        "今年戛纳电影节有哪些国家的影片参展",
        "最近的世界移动通信大会有哪些国家的企业参会",
        "今年日本东京国际动漫节吸引了哪些国家的动漫团队参与",
        "今年年大阪·关西世界园艺博览会有哪些国家设立展馆",
        "最近德国法兰克福国际车展有哪些国家的汽车品牌亮相",
        "第九届世界无人机大会有哪些国家的企业和团队参加",
        "最近亚洲物流展CeMAT有哪些国家的物流企业参展",
        "最近世界教育创新峰会有哪些国家的教育机构代表出席",
        "2025年意大利美食节有哪些国家带来特色美食展示",
    ]

    # Scenario 1 (superseded below): investment report quality review.
    query = "帮我生成一份关于新能源领域的投资报告"
    criterion = """
核心评价维度:

1. 内容与逻辑 (灵魂)
- 清晰的投资论点: 报告是否有明确、可验证的“买入/卖出”观点及目标价?观点是否新颖,提供了市场尚未察觉的洞察?
- 严谨的逻辑链条: 从行业分析到公司研究,再到财务估值,所有论据是否像积木一样牢固支撑核心论点?推理过程是否严密,无逻辑跳跃?
- 研究的深度与广度: 是否覆盖了行业(格局、驱动)、公司(模式、护城河)、财务(三张表、比率)和估值(假设合理性)?

2. 数据与事实 (基石)
- 准确性与时效性: 数据是否准确?来源(年报、权威统计)是否可靠?信息是否最新?
- 相关性与解读: 数据是否与论点强相关?是否对数据背后的业务含义进行了深入解读,而非简单罗列?

3. 结构与表达 (骨架)
- 结构清晰: 是否有清晰的摘要、目录和逻辑流程?(例如:摘要->行业->公司->财务->估值->风险->结论)
- 语言专业: 语言是否精炼、客观、准确?专业术语使用是否恰当?
- 可视化: 图表是否清晰、直观?排版是否专业,重点突出?

4. 客观性与风险意识 (灵魂试金石)
- 客观中立: 是否同时呈现了正反论据?是否存在明显偏见?
- 全面的风险分析: 是否识别并深入分析了关键的下行风险(行业、公司、宏观、估值)?是否讨论了风险对投资论点的影响?"""

    # Scenario 2 (superseded below): find five working resume-template links.
    query = "查找5个简历模板链接"
    criterion = "通过访问链接确保每一条都是有效的, 并且资源链接的数量是否满足5条。有时候链接通过页面卡片提供,你需要通过点击卡片进行链接跳转。"
    prompt = f"""
GenFlow是一个AI聊天机器人。对于给定的问题,他会执行'任务规划','任务执行','输出结果'三个步骤。
在GenFlow执行每个步骤时,'任务规划','任务执行','输出结果'会分别呈现黑色或灰色。黑色代表该步骤完成,灰色代表该步骤未完成。
你需要作为测试员,在它的网页底部搜索栏中输入用户提交的问题,在GenFlow执行完三个任务后,客观地判断其最终是否按照给定标准:{criterion} 回答了问题。

GenFlow返回的内容可能比较长,你可以多次执行'scroll_at'操作来查看网页中它返回的上下文。
GenFlow有时会呈现左边主页面,右边预览区域。在这种情况下执行'scroll_at'的时候你要注意鼠标位置。
通过反复的滚动,确保主页面和预览区域已经滚动到底。预览区域往往很长,所以你需要多次执行'scroll_at'操作,确保2次滚动操作看到的网页完全一样为止, 给出最终客观评价.

GenFlow一定会给出'输出结果',请保持足够的耐心!
Regardless of the input language, you need to output in Chinese.
"""

    # Scenario 3 (ACTIVE): compare GenFlow and DouBao on simple arithmetic.
    # NOTE(review): the prompt says the session starts on DouBao's home page,
    # but `initial_url` above opens GenFlow - confirm which one is intended.
    criterion = """提交任务"1+1,2+3和4+5",判断结果分别是不是2,5,9"""
    query = f"""比较genflow和doubao哪个在数学计算上的能力更好。标准: {criterion}"""
    prompt = f"""
GenFlow和DouBao是2个AI聊天机器人。
你需要作为测试员,根据用户需求在它们的网页端搜索栏中输入问题,评测它们的能力
初始状态你处于DouBao的首页,在测试完毕后,你需要切换到GenFlow的首页进行测试。然后给出最终结论
DouBao的url是 <url>https://www.doubao.com/chat/</url>
GenFlow的url是 <url>{genflow_url}</url>

AI返回的内容可能比较长,你可以多次执行'scroll_at'操作来查看网页中它们返回的上下文。
你需要多次执行'scroll_at'操作,确保2次滚动操作看到的网页完全一样为止, 给出最终客观评价.

Regardless of the input language, you need to output in Chinese.
"""

    with env as browser_computer:
        agent = BrowserAgent(
            browser_computer=browser_computer,
            query=query,
            system_prompt=prompt,
            model_name='gemini-2.5-computer-use-preview-10-2025',
        )
        agent.agent_loop()
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
if __name__ == "__main__":
    # Script entry point: runs the hard-coded scenario, not the CLI in main().
    run_genflow()
|
logs/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
logs/screenshot_20251103_212455.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212504.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212514.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212605.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212612.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212613.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212614.png
ADDED
|
logs/screenshot_20251103_212622.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212631.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212732.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212738.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212739.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212740.png
ADDED
|
logs/screenshot_20251103_212750.png
ADDED
|
logs/screenshot_20251103_212759.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212809.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212949.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212956.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212957.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_212958.png
ADDED
|
logs/screenshot_20251103_213005.png
ADDED
|
logs/screenshot_20251103_213032.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_213042.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251103_213052.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251104_155252.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251104_155301.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251104_155302.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251104_155303.png
ADDED
|
logs/screenshot_20251104_155330.png
ADDED
|
Git LFS Details
|
logs/screenshot_20251104_155340.png
ADDED
|
Git LFS Details
|