From ae139bb461c2e7c6be154b04f3f2c80919808d31 Mon Sep 17 00:00:00 2001
From: Ankur Sharma <ankusharma@google.com>
Date: Thu, 9 Oct 2025 20:30:21 -0700
Subject: [PATCH] feat: ADK cli allows developers to create an eval set and add
 an eval case

Agent developers can now create an eval set and add eval cases directly from the command line. Adding an eval case is currently limited to specifying conversation scenarios.

Sample commands:
- Create an eval set:
adk eval_set create \
    contributing/samples/hello_world \
    set_01

- Add an eval case with scenario file
Content of scenarios.json file:
'{"scenarios": [{"starting_prompt": "hello", "conversation_plan": "world"}]}'

adk eval_set add_eval_case \
    contributing/samples/hello_world \
    set_01 \
    --scenarios_file scenarios.json

PiperOrigin-RevId: 817456117
---
 src/google/adk/cli/cli_eval.py                |  20 ++
 src/google/adk/cli/cli_tools_click.py         | 166 ++++++++++++++++-
 tests/unittests/cli/utils/test_cli_eval.py    |  51 ++++++
 .../cli/utils/test_cli_tools_click.py         | 172 ++++++++++++++++++
 4 files changed, 400 insertions(+), 9 deletions(-)
 create mode 100644 tests/unittests/cli/utils/test_cli_eval.py
diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
index dcd6652c..4f285d61 100644
--- a/src/google/adk/cli/cli_eval.py
+++ b/src/google/adk/cli/cli_eval.py
@@ -47,6 +47,7 @@ from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
 from ..evaluation.eval_metrics import JudgeModelOptions
 from ..evaluation.eval_result import EvalCaseResult
+from ..evaluation.eval_sets_manager import EvalSetsManager
 from ..evaluation.evaluator import EvalStatus
 from ..evaluation.evaluator import Evaluator
 from ..sessions.base_session_service import BaseSessionService
@@ -436,3 +437,22 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
     return FinalResponseMatchV2Evaluator(eval_metric)
 
   raise ValueError(f"Unsupported eval metric: {eval_metric}")
+
+
+def get_eval_sets_manager(
+    eval_storage_uri: Optional[str], agents_dir: str
+) -> EvalSetsManager:
+  """Returns the GCS EvalSetsManager for a storage URI, else a local one."""
+  try:
+    from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
+    from .utils import evals
+  except ModuleNotFoundError as mnf:
+    raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
+
+  # No (or empty) storage URI: use the local manager built from `agents_dir`.
+  if not eval_storage_uri:
+    return LocalEvalSetsManager(agents_dir=agents_dir)
+
+  # Otherwise `agents_dir` is unused and the URI alone selects the manager.
+  managers = evals.create_gcs_eval_managers_from_uri(eval_storage_uri)
+  return managers.eval_sets_manager
diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index e76d903f..7115b5fc 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -18,6 +18,8 @@ import asyncio
 from contextlib import asynccontextmanager
 from datetime import datetime
 import functools
+import hashlib
+import json
 import logging
 import os
 from pathlib import Path
@@ -433,6 +435,28 @@ def cli_run(
   )
 
 
+def eval_options():
+  """Returns a decorator that attaches the shared eval click options."""
+  storage_uri_option = click.option(
+      "--eval_storage_uri",
+      type=str,
+      help=(
+          "Optional. The evals storage URI to store agent evals,"
+          " supported URIs: gs://<bucket name>."
+      ),
+      default=None,
+  )
+
+  def decorator(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+      return func(*args, **kwargs)
+
+    return storage_uri_option(wrapper)
+
+  return decorator
+
+
 @main.command("eval", cls=HelpfulCommand)
 @click.argument(
     "agent_module_file_path",
@@ -449,15 +473,7 @@ def cli_run(
     default=False,
     help="Optional. Whether to print detailed results on console or not.",
 )
-@click.option(
-    "--eval_storage_uri",
-    type=str,
-    help=(
-        "Optional. The evals storage URI to store agent evals,"
-        " supported URIs: gs://<bucket name>."
-    ),
-    default=None,
-)
+@eval_options()
 def cli_eval(
     agent_module_file_path: str,
     eval_set_file_path_or_id: list[str],
@@ -675,6 +691,138 @@ def cli_eval(
       pretty_print_eval_result(eval_result)
 
 
+@main.group("eval_set")
+def eval_set():
+  """Manage Eval Sets."""
+  # Intentionally empty: the group only namespaces the `eval_set` subcommands.
+
+
+@eval_set.command("create", cls=HelpfulCommand)
+@click.argument(
+    "agent_module_file_path",
+    type=click.Path(
+        exists=True, dir_okay=True, file_okay=False, resolve_path=True
+    ),
+)
+@click.argument("eval_set_id", type=str, required=True)
+@eval_options()
+def cli_create_eval_set(
+    agent_module_file_path: str,
+    eval_set_id: str,
+    eval_storage_uri: Optional[str] = None,
+):
+  """Creates an empty EvalSet given the agent_module_file_path and eval_set_id."""
+  try:
+    from .cli_eval import get_eval_sets_manager
+  except ModuleNotFoundError as mnf:
+    raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
+
+  # Parent dir hosts the agents; the folder name doubles as the app name.
+  agents_dir, app_name = os.path.split(agent_module_file_path)
+  manager = get_eval_sets_manager(eval_storage_uri, agents_dir)
+  try:
+    manager.create_eval_set(app_name=app_name, eval_set_id=eval_set_id)
+    click.echo(f"Eval set '{eval_set_id}' created for app '{app_name}'.")
+  except ValueError as e:
+    raise click.ClickException(str(e)) from e
+
+
+@eval_set.command("add_eval_case", cls=HelpfulCommand)
+@click.argument(
+    "agent_module_file_path",
+    type=click.Path(
+        exists=True, dir_okay=True, file_okay=False, resolve_path=True
+    ),
+)
+@click.argument("eval_set_id", type=str, required=True)
+@click.option(
+    "--scenarios_file",
+    type=click.Path(
+        exists=True, dir_okay=False, file_okay=True, resolve_path=True
+    ),
+    help="A path to file containing JSON serialized ConversationScenarios.",
+    required=True,
+)
+@click.option(
+    "--session_input_file",
+    type=click.Path(
+        exists=True, dir_okay=False, file_okay=True, resolve_path=True
+    ),
+    help=(
+        "Optional. Path to session file containing SessionInput in JSON format."
+    ),
+    default=None,
+)
+@eval_options()
+def cli_add_eval_case(
+    agent_module_file_path: str,
+    eval_set_id: str,
+    scenarios_file: str,
+    eval_storage_uri: Optional[str] = None,
+    session_input_file: Optional[str] = None,
+):
+  """Adds eval cases to the given eval set.
+
+  There are several ways that an eval case can be created, for now this method
+  only supports adding one using a conversation scenarios file.
+
+  If an eval case for the generated id already exists, then we skip adding it.
+  """
+  try:
+    from ..evaluation.conversation_scenarios import ConversationScenarios
+    from ..evaluation.eval_case import EvalCase
+    from ..evaluation.eval_case import SessionInput
+    from .cli_eval import get_eval_sets_manager
+  except ModuleNotFoundError as mnf:
+    raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
+
+  app_name = os.path.basename(agent_module_file_path)
+  agents_dir = os.path.dirname(agent_module_file_path)
+  eval_sets_manager = get_eval_sets_manager(eval_storage_uri, agents_dir)
+
+  try:
+    session_input = None
+    if session_input_file:
+      with open(session_input_file, "r", encoding="utf-8") as f:
+        session_input = SessionInput.model_validate_json(f.read())
+
+    with open(scenarios_file, "r", encoding="utf-8") as f:
+      conversation_scenarios = ConversationScenarios.model_validate_json(
+          f.read()
+      )
+
+    for scenario in conversation_scenarios.scenarios:
+      # Content-derived id: identical scenarios map to the same eval case.
+      scenario_str = json.dumps(scenario.model_dump(), sort_keys=True)
+      eval_id = hashlib.sha256(scenario_str.encode("utf-8")).hexdigest()[:8]
+      # Re-running with the same file is a no-op for ids already in the set.
+      existing_eval_case = eval_sets_manager.get_eval_case(
+          app_name=app_name, eval_set_id=eval_set_id, eval_case_id=eval_id
+      )
+      if existing_eval_case is not None:
+        click.echo(
+            f"Eval case '{eval_id}' already exists in eval set"
+            f" '{eval_set_id}', skipped adding."
+        )
+        continue
+
+      eval_case = EvalCase(
+          eval_id=eval_id,
+          conversation_scenario=scenario,
+          session_input=session_input,
+          creation_timestamp=datetime.now().timestamp(),
+      )
+      eval_sets_manager.add_eval_case(
+          app_name=app_name, eval_set_id=eval_set_id, eval_case=eval_case
+      )
+      click.echo(
+          f"Eval case '{eval_case.eval_id}' added to eval set"
+          f" '{eval_set_id}'."
+      )
+  except Exception as e:
+    raise click.ClickException(f"Failed to add eval case(s): {e}") from e
+
+
 def web_options():
   """Decorator to add web UI options to click commands."""
 
diff --git a/tests/unittests/cli/utils/test_cli_eval.py b/tests/unittests/cli/utils/test_cli_eval.py
new file mode 100644
index 00000000..8ff33dd9
--- /dev/null
+++ b/tests/unittests/cli/utils/test_cli_eval.py
@@ -0,0 +1,51 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for utilities in cli_eval."""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from unittest import mock
+
+
+def test_get_eval_sets_manager_local(monkeypatch):
+  local_manager = mock.MagicMock()
+  monkeypatch.setattr(
+      "google.adk.evaluation.local_eval_sets_manager.LocalEvalSetsManager",
+      mock.MagicMock(return_value=local_manager),
+  )
+  from google.adk.cli.cli_eval import get_eval_sets_manager
+
+  result = get_eval_sets_manager(eval_storage_uri=None, agents_dir="some/dir")
+  assert result is local_manager
+
+
+def test_get_eval_sets_manager_gcs(monkeypatch):
+  storage_uri = "gs://bucket"
+  gcs_manager = mock.MagicMock()
+  create_gcs_managers = mock.MagicMock(
+      return_value=SimpleNamespace(eval_sets_manager=gcs_manager)
+  )
+  monkeypatch.setattr(
+      "google.adk.cli.utils.evals.create_gcs_eval_managers_from_uri",
+      create_gcs_managers,
+  )
+  from google.adk.cli.cli_eval import get_eval_sets_manager
+
+  result = get_eval_sets_manager(
+      eval_storage_uri=storage_uri, agents_dir="some/dir"
+  )
+  create_gcs_managers.assert_called_once_with(storage_uri)
+  assert result is gcs_manager
diff --git a/tests/unittests/cli/utils/test_cli_tools_click.py b/tests/unittests/cli/utils/test_cli_tools_click.py
index 138289ed..3bd02b39 100644
--- a/tests/unittests/cli/utils/test_cli_tools_click.py
+++ b/tests/unittests/cli/utils/test_cli_tools_click.py
@@ -18,6 +18,7 @@
 from __future__ import annotations
 
 import builtins
+import json
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any
@@ -620,6 +621,177 @@ def test_cli_eval_with_eval_set_id(
   assert len(eval_set_results) == 2
 
 
+def test_cli_create_eval_set(tmp_path: Path):
+  # `eval_set create` on a fresh agent dir must write an empty eval set file.
+  app_name = "test_app"
+  eval_set_id = "test_eval_set"
+  agent_dir = tmp_path / app_name
+  agent_dir.mkdir()
+  (agent_dir / "__init__.py").touch()
+
+  cli_args = ["eval_set", "create", str(agent_dir), eval_set_id]
+  result = CliRunner().invoke(cli_tools_click.main, cli_args)
+
+  assert result.exit_code == 0
+  # The eval set lands inside the agent dir, named after the eval set id.
+  eval_set_file = agent_dir / f"{eval_set_id}.evalset.json"
+  assert eval_set_file.exists()
+
+  with eval_set_file.open("r") as f:
+    created = json.load(f)
+  assert created["eval_set_id"] == eval_set_id
+  assert created["eval_cases"] == []
+
+
+def test_cli_add_eval_case_no_session(tmp_path: Path):
+  app_name = "test_app_add_1"
+  eval_set_id = "test_eval_set_add_1"
+  agent_path = tmp_path / app_name
+  agent_path.mkdir()
+  (agent_path / "__init__.py").touch()
+
+  scenarios_file = tmp_path / "scenarios1.json"
+  scenarios_file.write_text(
+      '{"scenarios": [{"starting_prompt": "hello", "conversation_plan":'
+      ' "world"}]}'
+  )
+
+  runner = CliRunner()
+  create_result = runner.invoke(
+      cli_tools_click.main,
+      ["eval_set", "create", str(agent_path), eval_set_id],
+      catch_exceptions=False,
+  )
+  assert create_result.exit_code == 0, create_result.output  # setup guard
+  result = runner.invoke(
+      cli_tools_click.main,
+      [
+          "eval_set",
+          "add_eval_case",
+          str(agent_path),
+          eval_set_id,
+          "--scenarios_file",
+          str(scenarios_file),
+      ],
+      catch_exceptions=False,
+  )
+
+  assert result.exit_code == 0
+  eval_set_file = agent_path / f"{eval_set_id}.evalset.json"
+  assert eval_set_file.exists()
+  eval_set_data = json.loads(eval_set_file.read_text())
+  assert len(eval_set_data["eval_cases"]) == 1
+  eval_case = eval_set_data["eval_cases"][0]
+  assert eval_case["eval_id"] == "0a1a5048"
+  assert eval_case["conversation_scenario"]["starting_prompt"] == "hello"
+  assert "session_input" not in eval_case
+
+
+def test_cli_add_eval_case_with_session(tmp_path: Path):
+  app_name = "test_app_add_2"
+  eval_set_id = "test_eval_set_add_2"
+  agent_path = tmp_path / app_name
+  agent_path.mkdir()
+  (agent_path / "__init__.py").touch()
+
+  scenarios_file = tmp_path / "scenarios2.json"
+  scenarios_file.write_text(
+      '{"scenarios": [{"starting_prompt": "hello", "conversation_plan":'
+      ' "world"}]}'
+  )
+  session_file = tmp_path / "session2.json"
+  session_file.write_text(
+      '{"app_name": "test_app_add_2", "user_id": "test_user", "state": {}}'
+  )
+
+  runner = CliRunner()
+  create_result = runner.invoke(
+      cli_tools_click.main,
+      ["eval_set", "create", str(agent_path), eval_set_id],
+      catch_exceptions=False,
+  )
+  assert create_result.exit_code == 0, create_result.output  # setup guard
+  result = runner.invoke(
+      cli_tools_click.main,
+      [
+          "eval_set",
+          "add_eval_case",
+          str(agent_path),
+          eval_set_id,
+          "--scenarios_file",
+          str(scenarios_file),
+          "--session_input_file",
+          str(session_file),
+      ],
+      catch_exceptions=False,
+  )
+
+  assert result.exit_code == 0
+  eval_set_file = agent_path / f"{eval_set_id}.evalset.json"
+  assert eval_set_file.exists()
+  eval_set_data = json.loads(eval_set_file.read_text())
+  assert len(eval_set_data["eval_cases"]) == 1
+  eval_case = eval_set_data["eval_cases"][0]
+  assert eval_case["eval_id"] == "0a1a5048"
+  assert eval_case["session_input"]["app_name"] == "test_app_add_2"
+
+
+def test_cli_add_eval_case_skip_existing(tmp_path: Path):
+  app_name = "test_app_add_3"
+  eval_set_id = "test_eval_set_add_3"
+  agent_path = tmp_path / app_name
+  agent_path.mkdir()
+  (agent_path / "__init__.py").touch()
+
+  scenarios_file = tmp_path / "scenarios3.json"
+  scenarios_file.write_text(
+      '{"scenarios": [{"starting_prompt": "hello", "conversation_plan":'
+      ' "world"}]}'
+  )
+
+  runner = CliRunner()
+  create_result = runner.invoke(
+      cli_tools_click.main,
+      ["eval_set", "create", str(agent_path), eval_set_id],
+      catch_exceptions=False,
+  )
+  assert create_result.exit_code == 0, create_result.output  # setup guard
+  result1 = runner.invoke(
+      cli_tools_click.main,
+      [
+          "eval_set",
+          "add_eval_case",
+          str(agent_path),
+          eval_set_id,
+          "--scenarios_file",
+          str(scenarios_file),
+      ],
+      catch_exceptions=False,
+  )
+  eval_set_file = agent_path / f"{eval_set_id}.evalset.json"
+  eval_set_data1 = json.loads(eval_set_file.read_text())
+
+  result2 = runner.invoke(
+      cli_tools_click.main,
+      [
+          "eval_set",
+          "add_eval_case",
+          str(agent_path),
+          eval_set_id,
+          "--scenarios_file",
+          str(scenarios_file),
+      ],
+      catch_exceptions=False,
+  )
+  eval_set_data2 = json.loads(eval_set_file.read_text())
+
+  assert result1.exit_code == 0
+  assert result2.exit_code == 0
+  assert len(eval_set_data1["eval_cases"]) == 1
+  assert len(eval_set_data2["eval_cases"]) == 1
+  assert "already exists" in result2.output
+
+
 def test_cli_deploy_cloud_run_gcloud_arg_conflict(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None: