From 04de3e197d7a57935488eb7bfa647c7ab62cd9d9 Mon Sep 17 00:00:00 2001
From: Ankur Sharma
Date: Wed, 25 Jun 2025 18:31:25 -0700
Subject: [PATCH] fix: Adding detailed information on each metric evaluation

Additionally, a few other small changes:

* Updated a test fixture to support the latest eval data schema. Somehow I
  missed doing that previously.
* Updated `evaluation_generator.py` to use `run_async` instead of `run`.
* Raise an informative error when the dependencies required for eval are not
  installed.
* Changed the behavior of the AgentEvaluator.evaluate method to run all the
  evals instead of failing at the first eval metric failure.

PiperOrigin-RevId: 775919127
---
 src/google/adk/cli/cli_eval.py                |  10 +-
 src/google/adk/cli/cli_tools_click.py         |   2 +-
 src/google/adk/evaluation/agent_evaluator.py  | 109 ++++++++++++--
 src/google/adk/evaluation/constants.py        |  20 +++
 .../adk/evaluation/evaluation_generator.py    |   2 +-
 .../trip_planner_agent/initial.session.json   |  13 --
 .../trip_planner_agent/trip_inquiry.test.json | 133 +++++++++++++++---
 7 files changed, 240 insertions(+), 49 deletions(-)
 create mode 100644 src/google/adk/evaluation/constants.py
 delete mode 100644 tests/integration/fixture/trip_planner_agent/initial.session.json

diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
index 13e205cb..01b06135 100644
--- a/src/google/adk/cli/cli_eval.py
+++ b/src/google/adk/cli/cli_eval.py
@@ -26,6 +26,7 @@ import uuid
 
 from ..agents import Agent
 from ..artifacts.base_artifact_service import BaseArtifactService
+from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.eval_case import EvalCase
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
@@ -38,10 +39,6 @@ from ..sessions.base_session_service import BaseSessionService
 
 logger = logging.getLogger("google_adk." + __name__)
 
-MISSING_EVAL_DEPENDENCIES_MESSAGE = (
-    "Eval module is not installed, please install via `pip install"
-    " google-adk[eval]`."
-)
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
 # This evaluation is not very stable.
@@ -150,7 +147,7 @@ async def run_evals(
     artifact_service: The artifact service to use during inferencing.
   """
   try:
-    from ..evaluation.agent_evaluator import EvaluationGenerator
+    from ..evaluation.evaluation_generator import EvaluationGenerator
   except ModuleNotFoundError as e:
     raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
 
@@ -252,7 +249,8 @@
           result = "❌ Failed"
 
         print(f"Result: {result}\n")
-
+      except ModuleNotFoundError as e:
+        raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
       except Exception:
         # Catching the general exception, so that we don't block other eval
         # cases.
diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index c0935cce..1bc7d566 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -31,12 +31,12 @@ import uvicorn
 from . import cli_create
 from . import cli_deploy
 from .. import version
+from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.gcs_eval_set_results_manager import GcsEvalSetResultsManager
 from ..evaluation.gcs_eval_sets_manager import GcsEvalSetsManager
 from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
 from ..sessions.in_memory_session_service import InMemorySessionService
 from .cli import run_cli
-from .cli_eval import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from .fast_api import get_fast_api_app
 from .utils import envs
 from .utils import evals
diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index 6ee001f9..486d01cf 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -23,16 +25,16 @@ from typing import Optional
 from typing import Union
 import uuid
 
+from google.genai import types as genai_types
 from pydantic import ValidationError
 
+from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from .eval_case import IntermediateData
 from .eval_set import EvalSet
-from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
-from .response_evaluator import ResponseEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
 
@@ -96,6 +98,7 @@ class AgentEvaluator:
       criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
+      print_detailed_results: bool = True,
   ):
     """Evaluates an agent using the given EvalSet.
 
@@ -109,7 +112,13 @@
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
+      print_detailed_results: Whether to print detailed results for each metric
+        evaluation.
     """
+    try:
+      from .evaluation_generator import EvaluationGenerator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     eval_case_responses_list = await EvaluationGenerator.generate_responses(
         eval_set=eval_set,
         agent_module_path=agent_module,
@@ -117,6 +126,8 @@
         agent_name=agent_name,
     )
 
+    failures = []
+
     for eval_case_responses in eval_case_responses_list:
       actual_invocations = [
           invocation
@@ -139,10 +150,25 @@
               )
           )
 
-        assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
-            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
-            f" but got {evaluation_result.overall_score}."
-        )
+        if print_detailed_results:
+          AgentEvaluator._print_details(
+              evaluation_result=evaluation_result,
+              metric_name=metric_name,
+              threshold=threshold,
+          )
+
+        # Gather all the failures.
+        if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+          failures.append(
+              f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+              f" but got {evaluation_result.overall_score}."
+          )
+
+    assert not failures, (
+        "Following are all the test failures. If you are looking to get more"
+        " details on the failures, then please re-run this test with"
+        " `print_detailed_results` set to `True`.\n{}".format("\n".join(failures))
+    )
 
   @staticmethod
   async def evaluate(
@@ -158,9 +184,10 @@
       agent_module: The path to python module that contains the definition of
         the agent. There is convention in place here, where the code is going
         to look for 'root_agent' in the loaded module.
-      eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
-        full path to the file containing eval dataset, or a directory that is
-        recursively explored for all files that have a `.test.json` suffix.
+      eval_dataset_file_path_or_dir: The eval data set. This can be either a
+        string representing the full path to the file containing the eval
+        dataset, or a directory that is recursively explored for all files
+        that have a `.test.json` suffix.
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
@@ -358,6 +385,11 @@ class AgentEvaluator:
 
   @staticmethod
   def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    try:
+      from .response_evaluator import ResponseEvaluator
+      from .trajectory_evaluator import TrajectoryEvaluator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
       return TrajectoryEvaluator(threshold=threshold)
     elif (
@@ -367,3 +399,60 @@
       return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
 
     raise ValueError(f"Unsupported eval metric: {metric_name}")
+
+  @staticmethod
+  def _print_details(
+      evaluation_result: EvaluationResult, metric_name: str, threshold: float
+  ):
+    try:
+      import pandas as pd
+      from tabulate import tabulate
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+    print(
+        f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
+        f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
+        f" `{evaluation_result.overall_score}`."
+    )
+
+    data = []
+    for per_invocation_result in evaluation_result.per_invocation_results:
+      data.append({
+          "eval_status": per_invocation_result.eval_status,
+          "score": per_invocation_result.score,
+          "threshold": threshold,
+          "prompt": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.user_content
+          ),
+          "expected_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.final_response
+          ),
+          "actual_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.actual_invocation.final_response
+          ),
+          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.expected_invocation.intermediate_data
+          ),
+          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.actual_invocation.intermediate_data
+          ),
+      })
+
+    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
+    print("\n\n")  # A few empty lines for visual clarity.
+
+  @staticmethod
+  def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  @staticmethod
+  def _convert_tool_calls_to_text(
+      intermediate_data: Optional[IntermediateData],
+  ) -> str:
+    if intermediate_data and intermediate_data.tool_uses:
+      return "\n".join([str(t) for t in intermediate_data.tool_uses])
+
+    return ""
diff --git a/src/google/adk/evaluation/constants.py b/src/google/adk/evaluation/constants.py
new file mode 100644
index 00000000..74248ed1
--- /dev/null
+++ b/src/google/adk/evaluation/constants.py
@@ -0,0 +1,20 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+MISSING_EVAL_DEPENDENCIES_MESSAGE = (
+    "Eval module is not installed. Please install it via `pip install"
+    " google-adk[eval]`."
+)
diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
index fbf6ea8e..1359967b 100644
--- a/src/google/adk/evaluation/evaluation_generator.py
+++ b/src/google/adk/evaluation/evaluation_generator.py
@@ -182,7 +182,7 @@
     tool_uses = []
     invocation_id = ""
 
-    for event in runner.run(
+    async for event in runner.run_async(
         user_id=user_id, session_id=session_id, new_message=user_content
     ):
       invocation_id = (
diff --git a/tests/integration/fixture/trip_planner_agent/initial.session.json b/tests/integration/fixture/trip_planner_agent/initial.session.json
deleted file mode 100644
index b33840cd..00000000
--- a/tests/integration/fixture/trip_planner_agent/initial.session.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "id": "test_id",
-  "app_name": "trip_planner_agent",
-  "user_id": "test_user",
-  "state": {
-    "origin": "San Francisco",
-    "interests": "Food, Shopping, Museums",
-    "range": "1000 miles",
-    "cities": ""
-  },
-  "events": [],
-  "last_update_time": 1741218714.258285
-}
diff --git a/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json b/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json
index c504f68e..317599c6 100644
--- a/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json
+++ b/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json
@@ -1,19 +1,116 @@
-[
-  {
-    "query": "Hi, who are you? What can you do?",
-    "expected_tool_use": [],
-    "reference": "I am trip_planner, and my goal is to plan the best trip ever. I can describe why a city was chosen, list its top attractions, and provide a detailed itinerary for each day of the trip.\n"
-  },
-  {
-    "query": "I want to travel from San Francisco to an European country in fall next year. I am considering London and Paris. What is your advice?",
-    "expected_tool_use": [
-      {
-        "tool_name": "transfer_to_agent",
-        "tool_input": {
-          "agent_name": "indentify_agent"
+{
+  "eval_set_id": "e7996ccc-16bc-46bf-9a24-0a3ecc3dacd7",
+  "name": "e7996ccc-16bc-46bf-9a24-0a3ecc3dacd7",
+  "description": null,
+  "eval_cases": [
+    {
+      "eval_id": "/google/src/cloud/ankusharma/CS-agent_evaluator-2025-06-17_115009/google3/third_party/py/google/adk/open_source_workspace/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json",
+      "conversation": [
+        {
+          "invocation_id": "d7ff8ec1-290b-48c5-b3aa-05cb8f27b8ae",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "inline_data": null,
+                "file_data": null,
+                "thought_signature": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "function_call": null,
+                "function_response": null,
+                "text": "Hi, who are you? What can you do?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "inline_data": null,
+                "file_data": null,
+                "thought_signature": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "function_call": null,
+                "function_response": null,
+                "text": "I am trip_planner, and my goal is to plan the best trip ever. I can describe why a city was chosen, list its top attractions, and provide a detailed itinerary for each day of the trip.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1750190885.419684
+        },
+        {
+          "invocation_id": "f515ff57-ff21-488f-ab92-7d7de5bb76fe",
+          "user_content": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "inline_data": null,
+                "file_data": null,
+                "thought_signature": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "function_call": null,
+                "function_response": null,
+                "text": "I want to travel from San Francisco to an European country in fall next year. I am considering London and Paris. What is your advice?"
+              }
+            ],
+            "role": "user"
+          },
+          "final_response": {
+            "parts": [
+              {
+                "video_metadata": null,
+                "thought": null,
+                "inline_data": null,
+                "file_data": null,
+                "thought_signature": null,
+                "code_execution_result": null,
+                "executable_code": null,
+                "function_call": null,
+                "function_response": null,
+                "text": "Okay, I can help you analyze London and Paris to determine which city is better for your trip next fall. I will consider weather patterns, seasonal events, travel costs (including flights from San Francisco), and your interests (food, shopping, and museums). After gathering this information, I'll provide a detailed report on my chosen city.\n"
+              }
+            ],
+            "role": "model"
+          },
+          "intermediate_data": {
+            "tool_uses": [
+              {
+                "id": null,
+                "args": {
+                  "agent_name": "indentify_agent"
+                },
+                "name": "transfer_to_agent"
+              }
+            ],
+            "intermediate_responses": []
+          },
+          "creation_timestamp": 1750190885.4197457
         }
-      }
-    ],
-    "reference": "Okay, I can help you analyze London and Paris to determine which city is better for your trip next fall. I will consider weather patterns, seasonal events, travel costs (including flights from San Francisco), and your interests (food, shopping, and museums). After gathering this information, I'll provide a detailed report on my chosen city.\n"
-  }
-]
+      ],
+      "session_input": {
+        "app_name": "trip_planner_agent",
+        "user_id": "test_user",
+        "state": {
+          "origin": "San Francisco",
+          "interests": "Food, Shopping, Museums",
+          "range": "1000 miles",
+          "cities": ""
+        }
+      },
+      "creation_timestamp": 1750190885.4197533
+    }
+  ],
+  "creation_timestamp": 1750190885.4197605
+}
\ No newline at end of file
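
For context, a minimal sketch of how the changed `AgentEvaluator.evaluate` entry point is typically driven against the updated fixture. The test function name and the `pytest.mark.asyncio` wiring are illustrative assumptions, not part of this patch; only `agent_module`, `eval_dataset_file_path_or_dir`, and `num_runs` come from the signature shown above.

    import pytest

    from google.adk.evaluation.agent_evaluator import AgentEvaluator


    @pytest.mark.asyncio  # assumes pytest-asyncio is configured for the suite
    async def test_trip_inquiry_eval():
      # With this patch, a failing metric no longer aborts the run: all eval
      # metrics are evaluated, and every failure is reported together in a
      # single assertion at the end instead of stopping at the first one.
      await AgentEvaluator.evaluate(
          agent_module="tests.integration.fixture.trip_planner_agent",
          eval_dataset_file_path_or_dir=(
              "tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json"
          ),
          num_runs=1,
      )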