diff --git a/contributing/samples/computer_use/README.md b/contributing/samples/computer_use/README.md new file mode 100644 index 00000000..ff7ae6c0 --- /dev/null +++ b/contributing/samples/computer_use/README.md @@ -0,0 +1,96 @@ +# Computer Use Agent + +This directory contains a computer use agent that can operate a browser to complete user tasks. The agent uses Playwright to control a Chromium browser and can interact with web pages by taking screenshots, clicking, typing, and navigating. + +This agent is to demo the usage of ComputerUseToolset. + + +## Overview + +The computer use agent consists of: +- `agent.py`: Main agent configuration using Google's gemini-2.5-computer-use-preview-10-2025 model +- `playwright.py`: Playwright-based computer implementation for browser automation +- `requirements.txt`: Python dependencies + +## Setup + +### 1. Install Python Dependencies + +Install the required Python packages from the requirements file: + +```bash +uv pip install -r internal/samples/computer_use/requirements.txt +``` + +### 2. Install Playwright Dependencies + +Install Playwright's system dependencies for Chromium: + +```bash +playwright install-deps chromium +``` + +### 3. Install Chromium Browser + +Install the Chromium browser for Playwright: + +```bash +playwright install chromium +``` + +## Usage + +### Running the Agent + +To start the computer use agent, run the following command from the project root: + +```bash +adk web internal/samples +``` + +This will start the ADK web interface where you can interact with the computer_use agent. + +### Example Queries + +Once the agent is running, you can send queries like: + +``` +find me a flight from SF to Hawaii on next Monday, coming back on next Friday. start by navigating directly to flights.google.com +``` + +The agent will: +1. Open a browser window +2. Navigate to the specified website +3. Interact with the page elements to complete your task +4. Provide updates on its progress + +### Other Example Tasks + +- Book hotel reservations +- Search for products online +- Fill out forms +- Navigate complex websites +- Research information across multiple pages + +## Technical Details + +- **Model**: Uses Google's `gemini-2.5-computer-use-preview-10-2025` model for computer use capabilities +- **Browser**: Automated Chromium browser via Playwright +- **Screen Size**: Configured for 600x800 resolution +- **Tools**: Uses ComputerUseToolset for screen capture, clicking, typing, and scrolling + +## Troubleshooting + +If you encounter issues: + +1. **Playwright not found**: Make sure you've run both `playwright install-deps chromium` and `playwright install chromium` +2. **Dependencies missing**: Verify all packages from `requirements.txt` are installed +3. **Browser crashes**: Check that your system supports Chromium and has sufficient resources +4. **Permission errors**: Ensure your user has permission to run browser automation tools + +## Notes + +- The agent operates in a controlled browser environment +- Screenshots are taken to help the agent understand the current state +- The agent will provide updates on its actions as it works +- Be patient as complex tasks may take some time to complete diff --git a/contributing/samples/computer_use/agent.py b/contributing/samples/computer_use/agent.py new file mode 100755 index 00000000..5fed9aa2 --- /dev/null +++ b/contributing/samples/computer_use/agent.py @@ -0,0 +1,35 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from google.adk import Agent +from google.adk.models.google_llm import Gemini +from google.adk.tools.computer_use.computer_use_toolset import ComputerUseToolset +from typing_extensions import override + +from .playwright import PlaywrightComputer + +root_agent = Agent( + model='gemini-2.5-computer-use-preview-10-2025', + name='hello_world_agent', + description=( + 'computer use agent that can operate a browser on a computer to finish' + ' user tasks' + ), + instruction=""" + you are a computer use agent + """, + tools=[ + ComputerUseToolset(computer=PlaywrightComputer(screen_size=(1280, 936))) + ], +) diff --git a/contributing/samples/computer_use/playwright.py b/contributing/samples/computer_use/playwright.py new file mode 100644 index 00000000..64ad54fd --- /dev/null +++ b/contributing/samples/computer_use/playwright.py @@ -0,0 +1,317 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import asyncio +import time +from typing import Literal + +from google.adk.tools.computer_use.base_computer import BaseComputer +from google.adk.tools.computer_use.base_computer import ComputerEnvironment +from google.adk.tools.computer_use.base_computer import ComputerState +from playwright.async_api import async_playwright +import termcolor +from typing_extensions import override + +# Define a mapping from the user-friendly key names to Playwright's expected key names. +# Playwright is generally good with case-insensitivity for these, but it's best to be canonical. +# See: https://playwright.dev/docs/api/class-keyboard#keyboard-press +# Keys like 'a', 'b', '1', '$' are passed directly. +PLAYWRIGHT_KEY_MAP = { + "backspace": "Backspace", + "tab": "Tab", + "return": "Enter", # Playwright uses 'Enter' + "enter": "Enter", + "shift": "Shift", + "control": "Control", # Or 'ControlOrMeta' for cross-platform Ctrl/Cmd + "alt": "Alt", + "escape": "Escape", + "space": "Space", # Can also just be " " + "pageup": "PageUp", + "pagedown": "PageDown", + "end": "End", + "home": "Home", + "left": "ArrowLeft", + "up": "ArrowUp", + "right": "ArrowRight", + "down": "ArrowDown", + "insert": "Insert", + "delete": "Delete", + "semicolon": ";", # For actual character ';' + "equals": "=", # For actual character '=' + "multiply": "Multiply", # NumpadMultiply + "add": "Add", # NumpadAdd + "separator": "Separator", # Numpad specific + "subtract": "Subtract", # NumpadSubtract, or just '-' for character + "decimal": "Decimal", # NumpadDecimal, or just '.' for character + "divide": "Divide", # NumpadDivide, or just '/' for character + "f1": "F1", + "f2": "F2", + "f3": "F3", + "f4": "F4", + "f5": "F5", + "f6": "F6", + "f7": "F7", + "f8": "F8", + "f9": "F9", + "f10": "F10", + "f11": "F11", + "f12": "F12", + "command": "Meta", # 'Meta' is Command on macOS, Windows key on Windows +} + + +class PlaywrightComputer(BaseComputer): + """Conputer that controls Chromium via Playwright.""" + + def __init__( + self, + screen_size: tuple[int, int], + initial_url: str = "https://www.google.com", + search_engine_url: str = "https://www.google.com", + highlight_mouse: bool = False, + ): + self._initial_url = initial_url + self._screen_size = screen_size + self._search_engine_url = search_engine_url + self._highlight_mouse = highlight_mouse + + @override + async def initialize(self): + print("Creating session...") + self._playwright = await async_playwright().start() + self._browser = await self._playwright.chromium.launch( + args=["--disable-blink-features=AutomationControlled"], + headless=False, + ) + self._context = await self._browser.new_context( + viewport={ + "width": self._screen_size[0], + "height": self._screen_size[1], + } + ) + self._page = await self._context.new_page() + await self._page.goto(self._initial_url) + + termcolor.cprint( + f"Started local playwright.", + color="green", + attrs=["bold"], + ) + + @override + async def environment(self): + return ComputerEnvironment.ENVIRONMENT_BROWSER + + @override + async def close(self, exc_type, exc_val, exc_tb): + if self._context: + self._context.close() + try: + self._browser.close() + except Exception as e: + # Browser was already shut down because of SIGINT or such. + if ( + "Browser.close: Connection closed while reading from the driver" + in str(e) + ): + pass + else: + raise + + self._playwright.stop() + + async def open_web_browser(self) -> ComputerState: + return await self.current_state() + + async def click_at(self, x: int, y: int): + await self.highlight_mouse(x, y) + await self._page.mouse.click(x, y) + await self._page.wait_for_load_state() + return await self.current_state() + + async def hover_at(self, x: int, y: int): + await self.highlight_mouse(x, y) + await self._page.mouse.move(x, y) + await self._page.wait_for_load_state() + return await self.current_state() + + async def type_text_at( + self, + x: int, + y: int, + text: str, + press_enter: bool = True, + clear_before_typing: bool = True, + ) -> ComputerState: + await self.highlight_mouse(x, y) + await self._page.mouse.click(x, y) + await self._page.wait_for_load_state() + + if clear_before_typing: + await self.key_combination(["Control", "A"]) + await self.key_combination(["Delete"]) + + await self._page.keyboard.type(text) + await self._page.wait_for_load_state() + + if press_enter: + await self.key_combination(["Enter"]) + await self._page.wait_for_load_state() + return await self.current_state() + + async def _horizontal_document_scroll( + self, direction: Literal["left", "right"] + ) -> ComputerState: + # Scroll by 50% of the viewport size. + horizontal_scroll_amount = await self.screen_size()[0] // 2 + if direction == "left": + sign = "-" + else: + sign = "" + scroll_argument = f"{sign}{horizontal_scroll_amount}" + # Scroll using JS. + await self._page.evaluate(f"window.scrollBy({scroll_argument}, 0); ") + await self._page.wait_for_load_state() + return await self.current_state() + + async def scroll_document( + self, direction: Literal["up", "down", "left", "right"] + ) -> ComputerState: + if direction == "down": + return await self.key_combination(["PageDown"]) + elif direction == "up": + return await self.key_combination(["PageUp"]) + elif direction in ("left", "right"): + return await self._horizontal_document_scroll(direction) + else: + raise ValueError("Unsupported direction: ", direction) + + async def scroll_at( + self, + x: int, + y: int, + direction: Literal["up", "down", "left", "right"], + magnitude: int, + ) -> ComputerState: + await self.highlight_mouse(x, y) + + await self._page.mouse.move(x, y) + await self._page.wait_for_load_state() + + dx = 0 + dy = 0 + if direction == "up": + dy = -magnitude + elif direction == "down": + dy = magnitude + elif direction == "left": + dx = -magnitude + elif direction == "right": + dx = magnitude + else: + raise ValueError("Unsupported direction: ", direction) + + await self._page.mouse.wheel(dx, dy) + await self._page.wait_for_load_state() + return await self.current_state() + + async def wait(self, seconds: int) -> ComputerState: + await asyncio.sleep(seconds) + return await self.current_state() + + async def go_back(self) -> ComputerState: + await self._page.go_back() + await self._page.wait_for_load_state() + return await self.current_state() + + async def go_forward(self) -> ComputerState: + await self._page.go_forward() + await self._page.wait_for_load_state() + return await self.current_state() + + async def search(self) -> ComputerState: + return await self.navigate(self._search_engine_url) + + async def navigate(self, url: str) -> ComputerState: + await self._page.goto(url) + await self._page.wait_for_load_state() + return await self.current_state() + + async def key_combination(self, keys: list[str]) -> ComputerState: + # Normalize all keys to the Playwright compatible version. + keys = [PLAYWRIGHT_KEY_MAP.get(k.lower(), k) for k in keys] + + for key in keys[:-1]: + await self._page.keyboard.down(key) + + await self._page.keyboard.press(keys[-1]) + + for key in reversed(keys[:-1]): + await self._page.keyboard.up(key) + + return await self.current_state() + + async def drag_and_drop( + self, x: int, y: int, destination_x: int, destination_y: int + ) -> ComputerState: + await self.highlight_mouse(x, y) + await self._page.mouse.move(x, y) + await self._page.wait_for_load_state() + await self._page.mouse.down() + await self._page.wait_for_load_state() + + await self.highlight_mouse(destination_x, destination_y) + await self._page.mouse.move(destination_x, destination_y) + await self._page.wait_for_load_state() + await self._page.mouse.up() + return await self.current_state() + + async def current_state(self) -> ComputerState: + await self._page.wait_for_load_state() + # Even if Playwright reports the page as loaded, it may not be so. + # Add a manual sleep to make sure the page has finished rendering. + time.sleep(0.5) + screenshot_bytes = await self._page.screenshot(type="png", full_page=False) + return ComputerState(screenshot=screenshot_bytes, url=self._page.url) + + async def screen_size(self) -> tuple[int, int]: + return self._screen_size + + async def highlight_mouse(self, x: int, y: int): + if not self._highlight_mouse: + return + await self._page.evaluate(f""" + () => {{ + const element_id = "playwright-feedback-circle"; + const div = document.createElement('div'); + div.id = element_id; + div.style.pointerEvents = 'none'; + div.style.border = '4px solid red'; + div.style.borderRadius = '50%'; + div.style.width = '20px'; + div.style.height = '20px'; + div.style.position = 'fixed'; + div.style.zIndex = '9999'; + document.body.appendChild(div); + + div.hidden = false; + div.style.left = {x} - 10 + 'px'; + div.style.top = {y} - 10 + 'px'; + + setTimeout(() => {{ + div.hidden = true; + }}, 2000); + }} + """) + # Wait a bit for the user to see the cursor. + time.sleep(1) diff --git a/contributing/samples/computer_use/requirements.txt b/contributing/samples/computer_use/requirements.txt new file mode 100644 index 00000000..5b1df138 --- /dev/null +++ b/contributing/samples/computer_use/requirements.txt @@ -0,0 +1,4 @@ +termcolor==3.1.0 +playwright==1.52.0 +browserbase==1.3.0 +rich