You've already forked adk-python
mirror of
https://github.com/encounter/adk-python.git
synced 2026-03-30 10:57:20 -07:00
chore: Add computer use sample agent
PiperOrigin-RevId: 820407078
This commit is contained in:
committed by
Copybara-Service
parent
37a153ef94
commit
2a8fdd94e1
@@ -0,0 +1,96 @@
|
||||
# Computer Use Agent
|
||||
|
||||
This directory contains a computer use agent that can operate a browser to complete user tasks. The agent uses Playwright to control a Chromium browser and can interact with web pages by taking screenshots, clicking, typing, and navigating.
|
||||
|
||||
This agent demonstrates how to use `ComputerUseToolset`.
|
||||
|
||||
|
||||
## Overview
|
||||
|
||||
The computer use agent consists of:
|
||||
- `agent.py`: Main agent configuration using Google's gemini-2.5-computer-use-preview-10-2025 model
|
||||
- `playwright.py`: Playwright-based computer implementation for browser automation
|
||||
- `requirements.txt`: Python dependencies
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Install Python Dependencies
|
||||
|
||||
Install the required Python packages from the requirements file:
|
||||
|
||||
```bash
|
||||
uv pip install -r internal/samples/computer_use/requirements.txt
|
||||
```
|
||||
|
||||
### 2. Install Playwright Dependencies
|
||||
|
||||
Install Playwright's system dependencies for Chromium:
|
||||
|
||||
```bash
|
||||
playwright install-deps chromium
|
||||
```
|
||||
|
||||
### 3. Install Chromium Browser
|
||||
|
||||
Install the Chromium browser for Playwright:
|
||||
|
||||
```bash
|
||||
playwright install chromium
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Running the Agent
|
||||
|
||||
To start the computer use agent, run the following command from the project root:
|
||||
|
||||
```bash
|
||||
adk web internal/samples
|
||||
```
|
||||
|
||||
This will start the ADK web interface where you can interact with the computer_use agent.
|
||||
|
||||
### Example Queries
|
||||
|
||||
Once the agent is running, you can send queries like:
|
||||
|
||||
```
|
||||
find me a flight from SF to Hawaii on next Monday, coming back on next Friday. start by navigating directly to flights.google.com
|
||||
```
|
||||
|
||||
The agent will:
|
||||
1. Open a browser window
|
||||
2. Navigate to the specified website
|
||||
3. Interact with the page elements to complete your task
|
||||
4. Provide updates on its progress
|
||||
|
||||
### Other Example Tasks
|
||||
|
||||
- Book hotel reservations
|
||||
- Search for products online
|
||||
- Fill out forms
|
||||
- Navigate complex websites
|
||||
- Research information across multiple pages
|
||||
|
||||
## Technical Details
|
||||
|
||||
- **Model**: Uses Google's `gemini-2.5-computer-use-preview-10-2025` model for computer use capabilities
|
||||
- **Browser**: Automated Chromium browser via Playwright
|
||||
- **Screen Size**: Configured for a 1280x936 browser viewport (set via `screen_size` in `agent.py`)
|
||||
- **Tools**: Uses ComputerUseToolset for screen capture, clicking, typing, and scrolling
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If you encounter issues:
|
||||
|
||||
1. **Playwright not found**: Make sure you've run both `playwright install-deps chromium` and `playwright install chromium`
|
||||
2. **Dependencies missing**: Verify all packages from `requirements.txt` are installed
|
||||
3. **Browser crashes**: Check that your system supports Chromium and has sufficient resources
|
||||
4. **Permission errors**: Ensure your user has permission to run browser automation tools
|
||||
|
||||
## Notes
|
||||
|
||||
- The agent operates in a controlled browser environment
|
||||
- Screenshots are taken to help the agent understand the current state
|
||||
- The agent will provide updates on its actions as it works
|
||||
- Be patient as complex tasks may take some time to complete
|
||||
Executable
+35
@@ -0,0 +1,35 @@
|
||||
# Copyright 2025 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from google.adk import Agent
|
||||
from google.adk.models.google_llm import Gemini
|
||||
from google.adk.tools.computer_use.computer_use_toolset import ComputerUseToolset
|
||||
from typing_extensions import override
|
||||
|
||||
from .playwright import PlaywrightComputer
|
||||
|
||||
# Module-level agent for this sample: a Gemini computer-use model wired to a
# Playwright-driven Chromium browser through ComputerUseToolset.
# NOTE(review): the agent name below still reads 'hello_world_agent', which
# looks like a leftover from a template -- confirm before renaming, since the
# name is runtime-visible.
root_agent = Agent(
  model='gemini-2.5-computer-use-preview-10-2025',
  name='hello_world_agent',
  description=(
    'computer use agent that can operate a browser on a computer to finish'
    ' user tasks'
  ),
  instruction="""
    you are a computer use agent
    """,
  tools=[
    # screen_size is (width, height); PlaywrightComputer uses it as the
    # browser viewport size.
    ComputerUseToolset(computer=PlaywrightComputer(screen_size=(1280, 936)))
  ],
)
|
||||
@@ -0,0 +1,317 @@
|
||||
# Copyright 2025 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Literal
|
||||
|
||||
from google.adk.tools.computer_use.base_computer import BaseComputer
|
||||
from google.adk.tools.computer_use.base_computer import ComputerEnvironment
|
||||
from google.adk.tools.computer_use.base_computer import ComputerState
|
||||
from playwright.async_api import async_playwright
|
||||
import termcolor
|
||||
from typing_extensions import override
|
||||
|
||||
# Translation table: lowercase, user-friendly key name -> the key name handed
# to Playwright's keyboard API. Lookups are expected to lowercase the incoming
# name first; anything absent from this table (plain characters such as 'a',
# '1' or '$') is meant to be forwarded to Playwright unchanged.
# Reference: https://playwright.dev/docs/api/class-keyboard#keyboard-press
PLAYWRIGHT_KEY_MAP = {
  # Editing and whitespace keys.
  "backspace": "Backspace",
  "tab": "Tab",
  "return": "Enter",  # 'return' is an alias; Playwright's name is 'Enter'.
  "enter": "Enter",
  # Modifier keys.
  "shift": "Shift",
  "control": "Control",  # 'ControlOrMeta' would be the cross-platform form.
  "alt": "Alt",
  "escape": "Escape",
  "space": "Space",  # A literal " " is an alternative spelling.
  # Navigation keys.
  "pageup": "PageUp",
  "pagedown": "PageDown",
  "end": "End",
  "home": "Home",
  "left": "ArrowLeft",
  "up": "ArrowUp",
  "right": "ArrowRight",
  "down": "ArrowDown",
  "insert": "Insert",
  "delete": "Delete",
  # Named punctuation that maps to the literal character.
  "semicolon": ";",
  "equals": "=",
  # Numpad-style names (the plain characters '-', '.', '/' also work for
  # typing the corresponding symbol).
  "multiply": "Multiply",
  "add": "Add",
  "separator": "Separator",
  "subtract": "Subtract",
  "decimal": "Decimal",
  "divide": "Divide",
  # Function keys f1..f12 -> F1..F12, generated in place to keep key order.
  **{f"f{number}": f"F{number}" for number in range(1, 13)},
  # 'Meta' is the Command key on macOS and the Windows key on Windows.
  "command": "Meta",
}
|
||||
|
||||
|
||||
class PlaywrightComputer(BaseComputer):
  """Computer that controls Chromium via Playwright.

  The browser is launched lazily by `initialize()` and torn down by `close()`.
  Every action method performs its browser interaction and then returns the
  result of `current_state()` (a viewport screenshot plus the current URL).
  """

  def __init__(
      self,
      screen_size: tuple[int, int],
      initial_url: str = "https://www.google.com",
      search_engine_url: str = "https://www.google.com",
      highlight_mouse: bool = False,
  ):
    """Stores the configuration; no browser is started here.

    Args:
      screen_size: (width, height) used as the browser viewport size.
      initial_url: Page opened right after the browser starts.
      search_engine_url: Page that `search()` navigates to.
      highlight_mouse: When True, a red circle is briefly drawn at the target
        coordinates before mouse actions.
    """
    self._initial_url = initial_url
    self._screen_size = screen_size
    self._search_engine_url = search_engine_url
    self._highlight_mouse = highlight_mouse
    # Populated by initialize(). Defined up front so close() is safe to call
    # even when initialize() never ran.
    self._playwright = None
    self._browser = None
    self._context = None
    self._page = None

  @override
  async def initialize(self):
    """Starts Playwright, launches headed Chromium and opens the initial URL."""
    print("Creating session...")
    self._playwright = await async_playwright().start()
    self._browser = await self._playwright.chromium.launch(
        args=["--disable-blink-features=AutomationControlled"],
        headless=False,
    )
    self._context = await self._browser.new_context(
        viewport={
            "width": self._screen_size[0],
            "height": self._screen_size[1],
        }
    )
    self._page = await self._context.new_page()
    await self._page.goto(self._initial_url)

    termcolor.cprint(
        "Started local playwright.",
        color="green",
        attrs=["bold"],
    )

  @override
  async def environment(self):
    """Returns the environment type of this computer (a browser)."""
    return ComputerEnvironment.ENVIRONMENT_BROWSER

  @override
  async def close(self, exc_type=None, exc_val=None, exc_tb=None):
    """Closes the browser context, the browser and the Playwright driver.

    The three parameters are accepted for compatibility with context-manager
    style callers and are not used; they default to None so a plain
    `await computer.close()` also works.

    Args:
      exc_type: Unused.
      exc_val: Unused.
      exc_tb: Unused.

    Raises:
      Exception: Any error raised while closing, except the known
        "connection closed" error from `Browser.close`, which is ignored.
    """
    try:
      if self._context:
        # The async Playwright API returns coroutines; they must be awaited
        # or nothing is actually shut down.
        await self._context.close()
      if self._browser:
        try:
          await self._browser.close()
        except Exception as e:
          # Browser was already shut down because of SIGINT or such.
          if (
              "Browser.close: Connection closed while reading from the driver"
              not in str(e)
          ):
            raise
    finally:
      # Always stop the driver, even if closing the browser failed.
      if self._playwright:
        await self._playwright.stop()
      self._page = None
      self._context = None
      self._browser = None
      self._playwright = None

  async def open_web_browser(self) -> ComputerState:
    """Returns the current state; the browser is already open."""
    return await self.current_state()

  async def click_at(self, x: int, y: int) -> ComputerState:
    """Clicks at viewport coordinates (x, y) and returns the new state."""
    await self.highlight_mouse(x, y)
    await self._page.mouse.click(x, y)
    await self._page.wait_for_load_state()
    return await self.current_state()

  async def hover_at(self, x: int, y: int) -> ComputerState:
    """Moves the mouse to (x, y) and returns the new state."""
    await self.highlight_mouse(x, y)
    await self._page.mouse.move(x, y)
    await self._page.wait_for_load_state()
    return await self.current_state()

  async def type_text_at(
      self,
      x: int,
      y: int,
      text: str,
      press_enter: bool = True,
      clear_before_typing: bool = True,
  ) -> ComputerState:
    """Clicks at (x, y), optionally clears the field, then types `text`.

    Args:
      x: Horizontal viewport coordinate to click before typing.
      y: Vertical viewport coordinate to click before typing.
      text: Text to type.
      press_enter: When True, Enter is pressed after typing.
      clear_before_typing: When True, existing content is selected and
        deleted before typing.

    Returns:
      The state after all key presses.
    """
    await self.highlight_mouse(x, y)
    await self._page.mouse.click(x, y)
    await self._page.wait_for_load_state()

    if clear_before_typing:
      # NOTE(review): select-all is sent as Control+A; on macOS the shortcut
      # is normally Meta+A -- confirm whether a platform-aware modifier is
      # wanted here.
      await self.key_combination(["Control", "A"])
      await self.key_combination(["Delete"])

    await self._page.keyboard.type(text)
    await self._page.wait_for_load_state()

    if press_enter:
      await self.key_combination(["Enter"])
    await self._page.wait_for_load_state()
    return await self.current_state()

  async def _horizontal_document_scroll(
      self, direction: Literal["left", "right"]
  ) -> ComputerState:
    """Scrolls the document horizontally by half the viewport width."""
    # `await` binds less tightly than subscription, so the call must be
    # parenthesized before indexing the returned tuple.
    width = (await self.screen_size())[0]
    # Scroll by 50% of the viewport size; negative values scroll left.
    scroll_amount = width // 2
    if direction == "left":
      scroll_amount = -scroll_amount
    # Scroll using JS.
    await self._page.evaluate(f"window.scrollBy({scroll_amount}, 0); ")
    await self._page.wait_for_load_state()
    return await self.current_state()

  async def scroll_document(
      self, direction: Literal["up", "down", "left", "right"]
  ) -> ComputerState:
    """Scrolls the whole document one step in `direction`.

    Vertical scrolling is done with the PageUp/PageDown keys; horizontal
    scrolling uses JavaScript.

    Raises:
      ValueError: If `direction` is not one of the four supported values.
    """
    if direction == "down":
      return await self.key_combination(["PageDown"])
    elif direction == "up":
      return await self.key_combination(["PageUp"])
    elif direction in ("left", "right"):
      return await self._horizontal_document_scroll(direction)
    else:
      raise ValueError(f"Unsupported direction: {direction!r}")

  async def scroll_at(
      self,
      x: int,
      y: int,
      direction: Literal["up", "down", "left", "right"],
      magnitude: int,
  ) -> ComputerState:
    """Moves the mouse to (x, y) and scrolls the wheel there.

    Args:
      x: Horizontal viewport coordinate to scroll at.
      y: Vertical viewport coordinate to scroll at.
      direction: Scroll direction.
      magnitude: Wheel delta passed to Playwright's `mouse.wheel`.

    Raises:
      ValueError: If `direction` is not one of the four supported values.
    """
    # Validate before touching the browser so a bad direction has no side
    # effects.
    if direction == "up":
      dx, dy = 0, -magnitude
    elif direction == "down":
      dx, dy = 0, magnitude
    elif direction == "left":
      dx, dy = -magnitude, 0
    elif direction == "right":
      dx, dy = magnitude, 0
    else:
      raise ValueError(f"Unsupported direction: {direction!r}")

    await self.highlight_mouse(x, y)

    await self._page.mouse.move(x, y)
    await self._page.wait_for_load_state()

    await self._page.mouse.wheel(dx, dy)
    await self._page.wait_for_load_state()
    return await self.current_state()

  async def wait(self, seconds: int) -> ComputerState:
    """Sleeps for `seconds` without blocking the event loop."""
    await asyncio.sleep(seconds)
    return await self.current_state()

  async def go_back(self) -> ComputerState:
    """Navigates one entry back in the browser history."""
    await self._page.go_back()
    await self._page.wait_for_load_state()
    return await self.current_state()

  async def go_forward(self) -> ComputerState:
    """Navigates one entry forward in the browser history."""
    await self._page.go_forward()
    await self._page.wait_for_load_state()
    return await self.current_state()

  async def search(self) -> ComputerState:
    """Navigates to the configured search engine URL."""
    return await self.navigate(self._search_engine_url)

  async def navigate(self, url: str) -> ComputerState:
    """Loads `url` in the current page."""
    await self._page.goto(url)
    await self._page.wait_for_load_state()
    return await self.current_state()

  async def key_combination(self, keys: list[str]) -> ComputerState:
    """Presses a key combination such as ["Control", "A"].

    All keys but the last are held down, the last key is pressed, and the
    held keys are released in reverse order.

    Args:
      keys: Key names; each is normalized through PLAYWRIGHT_KEY_MAP
        (case-insensitively) and passed through unchanged if not found.

    Raises:
      ValueError: If `keys` is empty.
    """
    if not keys:
      raise ValueError("key_combination requires at least one key.")

    # Normalize all keys to the Playwright compatible version.
    keys = [PLAYWRIGHT_KEY_MAP.get(k.lower(), k) for k in keys]
    *modifiers, final_key = keys

    held = []
    try:
      for key in modifiers:
        await self._page.keyboard.down(key)
        held.append(key)
      await self._page.keyboard.press(final_key)
    finally:
      # Release whatever was pressed, even on failure, so modifiers do not
      # stay stuck for subsequent actions.
      for key in reversed(held):
        await self._page.keyboard.up(key)

    return await self.current_state()

  async def drag_and_drop(
      self, x: int, y: int, destination_x: int, destination_y: int
  ) -> ComputerState:
    """Drags from (x, y) to (destination_x, destination_y)."""
    await self.highlight_mouse(x, y)
    await self._page.mouse.move(x, y)
    await self._page.wait_for_load_state()
    await self._page.mouse.down()
    await self._page.wait_for_load_state()

    await self.highlight_mouse(destination_x, destination_y)
    await self._page.mouse.move(destination_x, destination_y)
    await self._page.wait_for_load_state()
    await self._page.mouse.up()
    return await self.current_state()

  async def current_state(self) -> ComputerState:
    """Returns a PNG screenshot of the viewport plus the current page URL."""
    await self._page.wait_for_load_state()
    # Even if Playwright reports the page as loaded, it may not be so.
    # Pause briefly so rendering can finish; asyncio.sleep keeps the event
    # loop responsive (time.sleep would block it).
    await asyncio.sleep(0.5)
    screenshot_bytes = await self._page.screenshot(type="png", full_page=False)
    return ComputerState(screenshot=screenshot_bytes, url=self._page.url)

  async def screen_size(self) -> tuple[int, int]:
    """Returns the configured (width, height)."""
    return self._screen_size

  async def highlight_mouse(self, x: int, y: int):
    """Draws a temporary red circle at (x, y) if highlighting is enabled.

    Does nothing when the instance was created with `highlight_mouse=False`.
    """
    if not self._highlight_mouse:
      return
    await self._page.evaluate(f"""
        () => {{
            const element_id = "playwright-feedback-circle";
            const div = document.createElement('div');
            div.id = element_id;
            div.style.pointerEvents = 'none';
            div.style.border = '4px solid red';
            div.style.borderRadius = '50%';
            div.style.width = '20px';
            div.style.height = '20px';
            div.style.position = 'fixed';
            div.style.zIndex = '9999';
            document.body.appendChild(div);

            div.hidden = false;
            div.style.left = {x} - 10 + 'px';
            div.style.top = {y} - 10 + 'px';

            setTimeout(() => {{
                div.hidden = true;
            }}, 2000);
        }}
    """)
    # Wait a bit for the user to see the cursor, without blocking the event
    # loop.
    await asyncio.sleep(1)
|
||||
@@ -0,0 +1,4 @@
|
||||
termcolor==3.1.0
|
||||
playwright==1.52.0
|
||||
browserbase==1.3.0
|
||||
rich
|
||||
Reference in New Issue
Block a user