chore: Add computer use sample agent

PiperOrigin-RevId: 820407078
2026-03-30 10:57:20 -07:00 · 2025-10-16 14:57:28 -07:00
parent 37a153ef94
commit 2a8fdd94e1
4 changed files with 452 additions and 0 deletions
@@ -0,0 +1,96 @@
+# Computer Use Agent
+
+This directory contains a computer use agent that can operate a browser to complete user tasks. The agent uses Playwright to control a Chromium browser and can interact with web pages by taking screenshots, clicking, typing, and navigating.
+
+This agent demonstrates how to use `ComputerUseToolset`.
+
+
+## Overview
+
+The computer use agent consists of:
+- `agent.py`: Main agent configuration using Google's gemini-2.5-computer-use-preview-10-2025 model
+- `playwright.py`: Playwright-based computer implementation for browser automation
+- `requirements.txt`: Python dependencies
+
+## Setup
+
+### 1. Install Python Dependencies
+
+Install the required Python packages from the requirements file:
+
+```bash
+uv pip install -r internal/samples/computer_use/requirements.txt
+```
+
+### 2. Install Playwright Dependencies
+
+Install Playwright's system dependencies for Chromium:
+
+```bash
+playwright install-deps chromium
+```
+
+### 3. Install Chromium Browser
+
+Install the Chromium browser for Playwright:
+
+```bash
+playwright install chromium
+```
+
+## Usage
+
+### Running the Agent
+
+To start the computer use agent, run the following command from the project root:
+
+```bash
+adk web internal/samples
+```
+
+This will start the ADK web interface where you can interact with the computer_use agent.
+
+### Example Queries
+
+Once the agent is running, you can send queries like:
+
+```
+find me a flight from SF to Hawaii on next Monday, coming back on next Friday. start by navigating directly to flights.google.com
+```
+
+The agent will:
+1. Open a browser window
+2. Navigate to the specified website
+3. Interact with the page elements to complete your task
+4. Provide updates on its progress
+
+### Other Example Tasks
+
+- Book hotel reservations
+- Search for products online
+- Fill out forms
+- Navigate complex websites
+- Research information across multiple pages
+
+## Technical Details
+
+- **Model**: Uses Google's `gemini-2.5-computer-use-preview-10-2025` model for computer use capabilities
+- **Browser**: Automated Chromium browser via Playwright
+- **Screen Size**: Configured for a 1280x936 browser viewport (see `screen_size` in `agent.py`)
+- **Tools**: Uses ComputerUseToolset for screen capture, clicking, typing, and scrolling
+
+## Troubleshooting
+
+If you encounter issues:
+
+1. **Playwright not found**: Make sure you've run both `playwright install-deps chromium` and `playwright install chromium`
+2. **Dependencies missing**: Verify all packages from `requirements.txt` are installed
+3. **Browser crashes**: Check that your system supports Chromium and has sufficient resources
+4. **Permission errors**: Ensure your user has permission to run browser automation tools
+
+## Notes
+
+- The agent operates in a controlled browser environment
+- Screenshots are taken to help the agent understand the current state
+- The agent will provide updates on its actions as it works
+- Be patient as complex tasks may take some time to complete
@@ -0,0 +1,35 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from google.adk import Agent
+from google.adk.models.google_llm import Gemini
+from google.adk.tools.computer_use.computer_use_toolset import ComputerUseToolset
+from typing_extensions import override
+
+from .playwright import PlaywrightComputer
+
+root_agent = Agent(
+    model='gemini-2.5-computer-use-preview-10-2025',
+    name='hello_world_agent',
+    description=(
+        'computer use agent that can operate a browser on a computer to finish'
+        ' user tasks'
+    ),
+    instruction="""
+      you are a computer use agent
+      """,
+    tools=[
+        ComputerUseToolset(computer=PlaywrightComputer(screen_size=(1280, 936)))
+    ],
+)
@@ -0,0 +1,317 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import time
+from typing import Literal
+
+from google.adk.tools.computer_use.base_computer import BaseComputer
+from google.adk.tools.computer_use.base_computer import ComputerEnvironment
+from google.adk.tools.computer_use.base_computer import ComputerState
+from playwright.async_api import async_playwright
+import termcolor
+from typing_extensions import override
+
+# Maps lowercase, user-friendly key names to the key names handed to
+# Playwright's keyboard API. `key_combination` lowercases each requested key
+# before looking it up here; names that are missing from this map (for example
+# 'a', 'b', '1', '$') are passed to Playwright unchanged.
+# See: https://playwright.dev/docs/api/class-keyboard#keyboard-press
+PLAYWRIGHT_KEY_MAP = {
+    "backspace": "Backspace",
+    "tab": "Tab",
+    "return": "Enter",  # Playwright uses 'Enter'
+    "enter": "Enter",
+    "shift": "Shift",
+    "control": "Control",  # Or 'ControlOrMeta' for cross-platform Ctrl/Cmd
+    "alt": "Alt",
+    "escape": "Escape",
+    "space": "Space",  # Can also just be " "
+    "pageup": "PageUp",
+    "pagedown": "PageDown",
+    "end": "End",
+    "home": "Home",
+    "left": "ArrowLeft",
+    "up": "ArrowUp",
+    "right": "ArrowRight",
+    "down": "ArrowDown",
+    "insert": "Insert",
+    "delete": "Delete",
+    "semicolon": ";",  # For actual character ';'
+    "equals": "=",  # For actual character '='
+    # NOTE(review): the six numpad entries below use legacy KeyboardEvent.key
+    # style names. Playwright's own names for these keys appear to be
+    # 'NumpadMultiply', 'NumpadAdd', 'NumpadSubtract', 'NumpadDecimal' and
+    # 'NumpadDivide' -- confirm that Playwright accepts the values used here.
+    "multiply": "Multiply",  # NumpadMultiply
+    "add": "Add",  # NumpadAdd
+    "separator": "Separator",  # Numpad specific
+    "subtract": "Subtract",  # NumpadSubtract, or just '-' for character
+    "decimal": "Decimal",  # NumpadDecimal, or just '.' for character
+    "divide": "Divide",  # NumpadDivide, or just '/' for character
+    "f1": "F1",
+    "f2": "F2",
+    "f3": "F3",
+    "f4": "F4",
+    "f5": "F5",
+    "f6": "F6",
+    "f7": "F7",
+    "f8": "F8",
+    "f9": "F9",
+    "f10": "F10",
+    "f11": "F11",
+    "f12": "F12",
+    "command": "Meta",  # 'Meta' is Command on macOS, Windows key on Windows
+}
+
+
+class PlaywrightComputer(BaseComputer):
+  """Conputer that controls Chromium via Playwright."""
+
+  def __init__(
+      self,
+      screen_size: tuple[int, int],
+      initial_url: str = "https://www.google.com",
+      search_engine_url: str = "https://www.google.com",
+      highlight_mouse: bool = False,
+  ):
+    self._initial_url = initial_url
+    self._screen_size = screen_size
+    self._search_engine_url = search_engine_url
+    self._highlight_mouse = highlight_mouse
+
+  @override
+  async def initialize(self):
+    print("Creating session...")
+    self._playwright = await async_playwright().start()
+    self._browser = await self._playwright.chromium.launch(
+        args=["--disable-blink-features=AutomationControlled"],
+        headless=False,
+    )
+    self._context = await self._browser.new_context(
+        viewport={
+            "width": self._screen_size[0],
+            "height": self._screen_size[1],
+        }
+    )
+    self._page = await self._context.new_page()
+    await self._page.goto(self._initial_url)
+
+    termcolor.cprint(
+        f"Started local playwright.",
+        color="green",
+        attrs=["bold"],
+    )
+
+  @override
+  async def environment(self):
+    return ComputerEnvironment.ENVIRONMENT_BROWSER
+
+  @override
+  async def close(self, exc_type, exc_val, exc_tb):
+    if self._context:
+      self._context.close()
+    try:
+      self._browser.close()
+    except Exception as e:
+      # Browser was already shut down because of SIGINT or such.
+      if (
+          "Browser.close: Connection closed while reading from the driver"
+          in str(e)
+      ):
+        pass
+      else:
+        raise
+
+    self._playwright.stop()
+
+  async def open_web_browser(self) -> ComputerState:
+    return await self.current_state()
+
+  async def click_at(self, x: int, y: int):
+    await self.highlight_mouse(x, y)
+    await self._page.mouse.click(x, y)
+    await self._page.wait_for_load_state()
+    return await self.current_state()
+
+  async def hover_at(self, x: int, y: int):
+    await self.highlight_mouse(x, y)
+    await self._page.mouse.move(x, y)
+    await self._page.wait_for_load_state()
+    return await self.current_state()
+
+  async def type_text_at(
+      self,
+      x: int,
+      y: int,
+      text: str,
+      press_enter: bool = True,
+      clear_before_typing: bool = True,
+  ) -> ComputerState:
+    await self.highlight_mouse(x, y)
+    await self._page.mouse.click(x, y)
+    await self._page.wait_for_load_state()
+
+    if clear_before_typing:
+      await self.key_combination(["Control", "A"])
+      await self.key_combination(["Delete"])
+
+    await self._page.keyboard.type(text)
+    await self._page.wait_for_load_state()
+
+    if press_enter:
+      await self.key_combination(["Enter"])
+    await self._page.wait_for_load_state()
+    return await self.current_state()
+
+  async def _horizontal_document_scroll(
+      self, direction: Literal["left", "right"]
+  ) -> ComputerState:
+    # Scroll by 50% of the viewport size.
+    horizontal_scroll_amount = await self.screen_size()[0] // 2
+    if direction == "left":
+      sign = "-"
+    else:
+      sign = ""
+    scroll_argument = f"{sign}{horizontal_scroll_amount}"
+    # Scroll using JS.
+    await self._page.evaluate(f"window.scrollBy({scroll_argument}, 0); ")
+    await self._page.wait_for_load_state()
+    return await self.current_state()
+
+  async def scroll_document(
+      self, direction: Literal["up", "down", "left", "right"]
+  ) -> ComputerState:
+    if direction == "down":
+      return await self.key_combination(["PageDown"])
+    elif direction == "up":
+      return await self.key_combination(["PageUp"])
+    elif direction in ("left", "right"):
+      return await self._horizontal_document_scroll(direction)
+    else:
+      raise ValueError("Unsupported direction: ", direction)
+
+  async def scroll_at(
+      self,
+      x: int,
+      y: int,
+      direction: Literal["up", "down", "left", "right"],
+      magnitude: int,
+  ) -> ComputerState:
+    await self.highlight_mouse(x, y)
+
+    await self._page.mouse.move(x, y)
+    await self._page.wait_for_load_state()
+
+    dx = 0
+    dy = 0
+    if direction == "up":
+      dy = -magnitude
+    elif direction == "down":
+      dy = magnitude
+    elif direction == "left":
+      dx = -magnitude
+    elif direction == "right":
+      dx = magnitude
+    else:
+      raise ValueError("Unsupported direction: ", direction)
+
+    await self._page.mouse.wheel(dx, dy)
+    await self._page.wait_for_load_state()
+    return await self.current_state()
+
+  async def wait(self, seconds: int) -> ComputerState:
+    await asyncio.sleep(seconds)
+    return await self.current_state()
+
+  async def go_back(self) -> ComputerState:
+    await self._page.go_back()
+    await self._page.wait_for_load_state()
+    return await self.current_state()
+
+  async def go_forward(self) -> ComputerState:
+    await self._page.go_forward()
+    await self._page.wait_for_load_state()
+    return await self.current_state()
+
+  async def search(self) -> ComputerState:
+    return await self.navigate(self._search_engine_url)
+
+  async def navigate(self, url: str) -> ComputerState:
+    await self._page.goto(url)
+    await self._page.wait_for_load_state()
+    return await self.current_state()
+
+  async def key_combination(self, keys: list[str]) -> ComputerState:
+    # Normalize all keys to the Playwright compatible version.
+    keys = [PLAYWRIGHT_KEY_MAP.get(k.lower(), k) for k in keys]
+
+    for key in keys[:-1]:
+      await self._page.keyboard.down(key)
+
+    await self._page.keyboard.press(keys[-1])
+
+    for key in reversed(keys[:-1]):
+      await self._page.keyboard.up(key)
+
+    return await self.current_state()
+
+  async def drag_and_drop(
+      self, x: int, y: int, destination_x: int, destination_y: int
+  ) -> ComputerState:
+    await self.highlight_mouse(x, y)
+    await self._page.mouse.move(x, y)
+    await self._page.wait_for_load_state()
+    await self._page.mouse.down()
+    await self._page.wait_for_load_state()
+
+    await self.highlight_mouse(destination_x, destination_y)
+    await self._page.mouse.move(destination_x, destination_y)
+    await self._page.wait_for_load_state()
+    await self._page.mouse.up()
+    return await self.current_state()
+
+  async def current_state(self) -> ComputerState:
+    await self._page.wait_for_load_state()
+    # Even if Playwright reports the page as loaded, it may not be so.
+    # Add a manual sleep to make sure the page has finished rendering.
+    time.sleep(0.5)
+    screenshot_bytes = await self._page.screenshot(type="png", full_page=False)
+    return ComputerState(screenshot=screenshot_bytes, url=self._page.url)
+
+  async def screen_size(self) -> tuple[int, int]:
+    return self._screen_size
+
+  async def highlight_mouse(self, x: int, y: int):
+    if not self._highlight_mouse:
+      return
+    await self._page.evaluate(f"""
+        () => {{
+            const element_id = "playwright-feedback-circle";
+            const div = document.createElement('div');
+            div.id = element_id;
+            div.style.pointerEvents = 'none';
+            div.style.border = '4px solid red';
+            div.style.borderRadius = '50%';
+            div.style.width = '20px';
+            div.style.height = '20px';
+            div.style.position = 'fixed';
+            div.style.zIndex = '9999';
+            document.body.appendChild(div);
+
+            div.hidden = false;
+            div.style.left = {x} - 10 + 'px';
+            div.style.top = {y} - 10 + 'px';
+
+            setTimeout(() => {{
+                div.hidden = true;
+            }}, 2000);
+        }}
+    """)
+    # Wait a bit for the user to see the cursor.
+    time.sleep(1)
@@ -0,0 +1,4 @@
+termcolor==3.1.0
+playwright==1.52.0
+browserbase==1.3.0
+rich