fix: Add detailed information on each metric evaluation

Additionally, a few other small changes:
*   Updated a test fixture to support the latest eval data schema; I had missed this previously.
*   Updated `evaluation_generator.py` to use `run_async` instead of `run`.
*   Raise an informative error when the dependencies required for eval are not installed.
*   Changed the behavior of the `AgentEvaluator.evaluate` method to run all the evals instead of failing at the first eval metric failure (see the sketch after this list).
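
A minimal sketch of the new fail-at-end behavior (the `run_metric` helper and the `criteria` dict are illustrative placeholders, not the actual ADK API; the import path is inferred from the diff below): failures are collected per metric, and a single assertion fires only after every metric has run.

from google.adk.evaluation.evaluator import EvalStatus

failures = []
for metric_name, threshold in criteria.items():
    # Hypothetical helper standing in for the per-metric evaluation call.
    evaluation_result = run_metric(metric_name, threshold)
    if evaluation_result.overall_eval_status != EvalStatus.PASSED:
        # Record the failure but keep evaluating the remaining metrics.
        failures.append(
            f"{metric_name} failed: expected {threshold},"
            f" got {evaluation_result.overall_score}."
        )
assert not failures, "\n".join(failures)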

PiperOrigin-RevId: 775919127
Ankur Sharma authored on 2025-06-25 18:31:25 -07:00, committed by Copybara-Service
parent 3901fade71
commit 04de3e197d
7 changed files with 240 additions and 49 deletions
+4 -6
@@ -26,6 +26,7 @@ import uuid
from ..agents import Agent
from ..artifacts.base_artifact_service import BaseArtifactService
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from ..evaluation.eval_case import EvalCase
from ..evaluation.eval_metrics import EvalMetric
from ..evaluation.eval_metrics import EvalMetricResult
@@ -38,10 +39,6 @@ from ..sessions.base_session_service import BaseSessionService
logger = logging.getLogger("google_adk." + __name__)
MISSING_EVAL_DEPENDENCIES_MESSAGE = (
"Eval module is not installed, please install via `pip install"
" google-adk[eval]`."
)
TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
RESPONSE_MATCH_SCORE_KEY = "response_match_score"
# This evaluation is not very stable.
@@ -150,7 +147,7 @@ async def run_evals(
artifact_service: The artifact service to use during inferencing.
"""
try:
from ..evaluation.agent_evaluator import EvaluationGenerator
from ..evaluation.evaluation_generator import EvaluationGenerator
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
@@ -252,7 +249,8 @@ async def run_evals(
result = "❌ Failed"
print(f"Result: {result}\n")
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
except Exception:
# Catching the general exception, so that we don't block other eval
# cases.
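
The error-handling shape above, as a minimal standalone sketch (the loop, `run_one_eval_case`, and the logging call are illustrative, not the actual `run_evals` body):

import logging

from google.adk.evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE

def run_all_eval_cases(eval_cases, run_one_eval_case):
    for eval_case in eval_cases:
        try:
            run_one_eval_case(eval_case)
        except ModuleNotFoundError as e:
            # Missing optional dependencies are fatal: re-raise with an
            # actionable install hint instead of a bare import failure.
            raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
        except Exception:
            # Any other per-case error is logged so the remaining eval
            # cases still get a chance to run.
            logging.exception("Eval case failed; continuing with the rest.")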
+1 -1
@@ -31,12 +31,12 @@ import uvicorn
from . import cli_create
from . import cli_deploy
from .. import version
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from ..evaluation.gcs_eval_set_results_manager import GcsEvalSetResultsManager
from ..evaluation.gcs_eval_sets_manager import GcsEvalSetsManager
from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
from ..sessions.in_memory_session_service import InMemorySessionService
from .cli import run_cli
from .cli_eval import MISSING_EVAL_DEPENDENCIES_MESSAGE
from .fast_api import get_fast_api_app
from .utils import envs
from .utils import evals
+99 -10
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import json
import logging
import os
@@ -23,16 +25,16 @@ from typing import Optional
from typing import Union
import uuid
from google.genai import types as genai_types
from pydantic import ValidationError
from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from .eval_case import IntermediateData
from .eval_set import EvalSet
from .evaluation_generator import EvaluationGenerator
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
from .response_evaluator import ResponseEvaluator
from .trajectory_evaluator import TrajectoryEvaluator
logger = logging.getLogger("google_adk." + __name__)
@@ -96,6 +98,7 @@ class AgentEvaluator:
criteria: dict[str, float],
num_runs=NUM_RUNS,
agent_name=None,
print_detailed_results: bool = True,
):
"""Evaluates an agent using the given EvalSet.
@@ -109,7 +112,13 @@ class AgentEvaluator:
num_runs: Number of times all entries in the eval dataset should be
assessed.
agent_name: The name of the agent.
print_detailed_results: Whether to print detailed results for each metric
evaluation.
"""
try:
from .evaluation_generator import EvaluationGenerator
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
eval_case_responses_list = await EvaluationGenerator.generate_responses(
eval_set=eval_set,
agent_module_path=agent_module,
@@ -117,6 +126,8 @@ class AgentEvaluator:
agent_name=agent_name,
)
failures = []
for eval_case_responses in eval_case_responses_list:
actual_invocations = [
invocation
@@ -139,10 +150,25 @@ class AgentEvaluator:
)
)
assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
f"{metric_name} for {agent_module} Failed. Expected {threshold},"
f" but got {evaluation_result.overall_score}."
)
if print_detailed_results:
AgentEvaluator._print_details(
evaluation_result=evaluation_result,
metric_name=metric_name,
threshold=threshold,
)
# Gather all the failures.
if evaluation_result.overall_eval_status != EvalStatus.PASSED:
failures.append(
f"{metric_name} for {agent_module} Failed. Expected {threshold},"
f" but got {evaluation_result.overall_score}."
)
assert not failures, (
"Following are all the test failures. If you looking to get more"
" details on the failures, then please re-run this test with"
" `print_details` set to `True`.\n{}".format("\n".join(failures))
)
@staticmethod
async def evaluate(
@@ -158,9 +184,10 @@ class AgentEvaluator:
agent_module: The path to python module that contains the definition of
the agent. There is convention in place here, where the code is going to
look for 'root_agent' in the loaded module.
eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
full path to the file containing eval dataset, or a directory that is
recursively explored for all files that have a `.test.json` suffix.
eval_dataset_file_path_or_dir: The eval data set. This can be either a
string representing full path to the file containing eval dataset, or a
directory that is recursively explored for all files that have a
`.test.json` suffix.
num_runs: Number of times all entries in the eval dataset should be
assessed.
agent_name: The name of the agent.
@@ -358,6 +385,11 @@ class AgentEvaluator:
@staticmethod
def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
try:
from .response_evaluator import ResponseEvaluator
from .trajectory_evaluator import TrajectoryEvaluator
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
return TrajectoryEvaluator(threshold=threshold)
elif (
@@ -367,3 +399,60 @@ class AgentEvaluator:
return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
raise ValueError(f"Unsupported eval metric: {metric_name}")
@staticmethod
def _print_details(
evaluation_result: EvaluationResult, metric_name: str, threshold: float
):
try:
import pandas as pd
from tabulate import tabulate
except ModuleNotFoundError as e:
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
print(
f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
f" `{evaluation_result.overall_score}`."
)
data = []
for per_invocation_result in evaluation_result.per_invocation_results:
data.append({
"eval_status": per_invocation_result.eval_status,
"score": per_invocation_result.score,
"threshold": threshold,
"prompt": AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.user_content
),
"expected_response": AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.final_response
),
"actual_response": AgentEvaluator._convert_content_to_text(
per_invocation_result.actual_invocation.final_response
),
"expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.expected_invocation.intermediate_data
),
"actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.actual_invocation.intermediate_data
),
})
print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
print("\n\n") # A few empty lines for visual clarity
@staticmethod
def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
if content and content.parts:
return "\n".join([p.text for p in content.parts if p.text])
return ""
@staticmethod
def _convert_tool_calls_to_text(
intermediate_data: Optional[IntermediateData],
) -> str:
if intermediate_data and intermediate_data.tool_uses:
return "\n".join([str(t) for t in intermediate_data.tool_uses])
return ""
+20
@@ -0,0 +1,20 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
MISSING_EVAL_DEPENDENCIES_MESSAGE = (
"Eval module is not installed, please install via `pip install"
" google-adk[eval]`."
)
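
A minimal sketch of the import-guard pattern this shared constant supports (absolute module paths are inferred from the relative imports in the hunks above):

from google.adk.evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE

try:
    # Optional dependency: only importable when `google-adk[eval]` is installed.
    from google.adk.evaluation.evaluation_generator import EvaluationGenerator
except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e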
@@ -182,7 +182,7 @@ class EvaluationGenerator:
tool_uses = []
invocation_id = ""
for event in runner.run(
async for event in runner.run_async(
user_id=user_id, session_id=session_id, new_message=user_content
):
invocation_id = (
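
The sync-to-async switch in context, as a hedged sketch (the enclosing helpers and their arguments are illustrative):

# Before: a regular function consuming events synchronously.
def collect_events(runner, user_id, session_id, user_content):
    return list(runner.run(
        user_id=user_id, session_id=session_id, new_message=user_content))

# After: the enclosing function becomes a coroutine and the event
# stream is consumed with `async for`.
async def collect_events_async(runner, user_id, session_id, user_content):
    events = []
    async for event in runner.run_async(
        user_id=user_id, session_id=session_id, new_message=user_content
    ):
        events.append(event)
    return events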
@@ -1,13 +0,0 @@
{
"id": "test_id",
"app_name": "trip_planner_agent",
"user_id": "test_user",
"state": {
"origin": "San Francisco",
"interests": "Food, Shopping, Museums",
"range": "1000 miles",
"cities": ""
},
"events": [],
"last_update_time": 1741218714.258285
}
@@ -1,19 +1,116 @@
[
{
"query": "Hi, who are you? What can you do?",
"expected_tool_use": [],
"reference": "I am trip_planner, and my goal is to plan the best trip ever. I can describe why a city was chosen, list its top attractions, and provide a detailed itinerary for each day of the trip.\n"
},
{
"query": "I want to travel from San Francisco to an European country in fall next year. I am considering London and Paris. What is your advice?",
"expected_tool_use": [
{
"tool_name": "transfer_to_agent",
"tool_input": {
"agent_name": "indentify_agent"
{
"eval_set_id": "e7996ccc-16bc-46bf-9a24-0a3ecc3dacd7",
"name": "e7996ccc-16bc-46bf-9a24-0a3ecc3dacd7",
"description": null,
"eval_cases": [
{
"eval_id": "/google/src/cloud/ankusharma/CS-agent_evaluator-2025-06-17_115009/google3/third_party/py/google/adk/open_source_workspace/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json",
"conversation": [
{
"invocation_id": "d7ff8ec1-290b-48c5-b3aa-05cb8f27b8ae",
"user_content": {
"parts": [
{
"video_metadata": null,
"thought": null,
"inline_data": null,
"file_data": null,
"thought_signature": null,
"code_execution_result": null,
"executable_code": null,
"function_call": null,
"function_response": null,
"text": "Hi, who are you? What can you do?"
}
],
"role": "user"
},
"final_response": {
"parts": [
{
"video_metadata": null,
"thought": null,
"inline_data": null,
"file_data": null,
"thought_signature": null,
"code_execution_result": null,
"executable_code": null,
"function_call": null,
"function_response": null,
"text": "I am trip_planner, and my goal is to plan the best trip ever. I can describe why a city was chosen, list its top attractions, and provide a detailed itinerary for each day of the trip.\n"
}
],
"role": "model"
},
"intermediate_data": {
"tool_uses": [],
"intermediate_responses": []
},
"creation_timestamp": 1750190885.419684
},
{
"invocation_id": "f515ff57-ff21-488f-ab92-7d7de5bb76fe",
"user_content": {
"parts": [
{
"video_metadata": null,
"thought": null,
"inline_data": null,
"file_data": null,
"thought_signature": null,
"code_execution_result": null,
"executable_code": null,
"function_call": null,
"function_response": null,
"text": "I want to travel from San Francisco to an European country in fall next year. I am considering London and Paris. What is your advice?"
}
],
"role": "user"
},
"final_response": {
"parts": [
{
"video_metadata": null,
"thought": null,
"inline_data": null,
"file_data": null,
"thought_signature": null,
"code_execution_result": null,
"executable_code": null,
"function_call": null,
"function_response": null,
"text": "Okay, I can help you analyze London and Paris to determine which city is better for your trip next fall. I will consider weather patterns, seasonal events, travel costs (including flights from San Francisco), and your interests (food, shopping, and museums). After gathering this information, I'll provide a detailed report on my chosen city.\n"
}
],
"role": "model"
},
"intermediate_data": {
"tool_uses": [
{
"id": null,
"args": {
"agent_name": "indentify_agent"
},
"name": "transfer_to_agent"
}
],
"intermediate_responses": []
},
"creation_timestamp": 1750190885.4197457
}
}
],
"reference": "Okay, I can help you analyze London and Paris to determine which city is better for your trip next fall. I will consider weather patterns, seasonal events, travel costs (including flights from San Francisco), and your interests (food, shopping, and museums). After gathering this information, I'll provide a detailed report on my chosen city.\n"
}
]
],
"session_input": {
"app_name": "trip_planner_agent",
"user_id": "test_user",
"state": {
"origin": "San Francisco",
"interests": "Food, Shopping, Museums",
"range": "1000 miles",
"cities": ""
}
},
"creation_timestamp": 1750190885.4197533
}
],
"creation_timestamp": 1750190885.4197605
}