You've already forked adk-python
mirror of
https://github.com/encounter/adk-python.git
synced 2026-03-30 10:57:20 -07:00
fix: Adding detailed information on each metric evaluation
Additionally, a few other small changes. * Updated a test fixture to support the latest eval data schema. Somehow I missed doing that previously. * Updated the `evaluation_generator.py` to use `run_async`, instead of `run`. * Also, raise an informative error when the dependencies required for eval are not installed. * Also, changed the behavior of the AgentEvaluator.evaluate method to run all the evals, instead of failing at the first eval metric failure. PiperOrigin-RevId: 775919127
This commit is contained in:
committed by
Copybara-Service
parent
3901fade71
commit
04de3e197d
@@ -26,6 +26,7 @@ import uuid
|
||||
|
||||
from ..agents import Agent
|
||||
from ..artifacts.base_artifact_service import BaseArtifactService
|
||||
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
|
||||
from ..evaluation.eval_case import EvalCase
|
||||
from ..evaluation.eval_metrics import EvalMetric
|
||||
from ..evaluation.eval_metrics import EvalMetricResult
|
||||
@@ -38,10 +39,6 @@ from ..sessions.base_session_service import BaseSessionService
|
||||
logger = logging.getLogger("google_adk." + __name__)
|
||||
|
||||
|
||||
MISSING_EVAL_DEPENDENCIES_MESSAGE = (
|
||||
"Eval module is not installed, please install via `pip install"
|
||||
" google-adk[eval]`."
|
||||
)
|
||||
TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
|
||||
RESPONSE_MATCH_SCORE_KEY = "response_match_score"
|
||||
# This evaluation is not very stable.
|
||||
@@ -150,7 +147,7 @@ async def run_evals(
|
||||
artifact_service: The artifact service to use during inferencing.
|
||||
"""
|
||||
try:
|
||||
from ..evaluation.agent_evaluator import EvaluationGenerator
|
||||
from ..evaluation.evaluation_generator import EvaluationGenerator
|
||||
except ModuleNotFoundError as e:
|
||||
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
|
||||
|
||||
@@ -252,7 +249,8 @@ async def run_evals(
|
||||
result = "❌ Failed"
|
||||
|
||||
print(f"Result: {result}\n")
|
||||
|
||||
except ModuleNotFoundError as e:
|
||||
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
|
||||
except Exception:
|
||||
# Catching the general exception, so that we don't block other eval
|
||||
# cases.
|
||||
|
||||
@@ -31,12 +31,12 @@ import uvicorn
|
||||
from . import cli_create
|
||||
from . import cli_deploy
|
||||
from .. import version
|
||||
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
|
||||
from ..evaluation.gcs_eval_set_results_manager import GcsEvalSetResultsManager
|
||||
from ..evaluation.gcs_eval_sets_manager import GcsEvalSetsManager
|
||||
from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
|
||||
from ..sessions.in_memory_session_service import InMemorySessionService
|
||||
from .cli import run_cli
|
||||
from .cli_eval import MISSING_EVAL_DEPENDENCIES_MESSAGE
|
||||
from .fast_api import get_fast_api_app
|
||||
from .utils import envs
|
||||
from .utils import evals
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -23,16 +25,16 @@ from typing import Optional
|
||||
from typing import Union
|
||||
import uuid
|
||||
|
||||
from google.genai import types as genai_types
|
||||
from pydantic import ValidationError
|
||||
|
||||
from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
|
||||
from .eval_case import IntermediateData
|
||||
from .eval_set import EvalSet
|
||||
from .evaluation_generator import EvaluationGenerator
|
||||
from .evaluator import EvalStatus
|
||||
from .evaluator import EvaluationResult
|
||||
from .evaluator import Evaluator
|
||||
from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
|
||||
from .response_evaluator import ResponseEvaluator
|
||||
from .trajectory_evaluator import TrajectoryEvaluator
|
||||
|
||||
logger = logging.getLogger("google_adk." + __name__)
|
||||
|
||||
@@ -96,6 +98,7 @@ class AgentEvaluator:
|
||||
criteria: dict[str, float],
|
||||
num_runs=NUM_RUNS,
|
||||
agent_name=None,
|
||||
print_detailed_results: bool = True,
|
||||
):
|
||||
"""Evaluates an agent using the given EvalSet.
|
||||
|
||||
@@ -109,7 +112,13 @@ class AgentEvaluator:
|
||||
num_runs: Number of times all entries in the eval dataset should be
|
||||
assessed.
|
||||
agent_name: The name of the agent.
|
||||
print_detailed_results: Whether to print detailed results for each metric
|
||||
evaluation.
|
||||
"""
|
||||
try:
|
||||
from .evaluation_generator import EvaluationGenerator
|
||||
except ModuleNotFoundError as e:
|
||||
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
|
||||
eval_case_responses_list = await EvaluationGenerator.generate_responses(
|
||||
eval_set=eval_set,
|
||||
agent_module_path=agent_module,
|
||||
@@ -117,6 +126,8 @@ class AgentEvaluator:
|
||||
agent_name=agent_name,
|
||||
)
|
||||
|
||||
failures = []
|
||||
|
||||
for eval_case_responses in eval_case_responses_list:
|
||||
actual_invocations = [
|
||||
invocation
|
||||
@@ -139,10 +150,25 @@ class AgentEvaluator:
|
||||
)
|
||||
)
|
||||
|
||||
assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
|
||||
f"{metric_name} for {agent_module} Failed. Expected {threshold},"
|
||||
f" but got {evaluation_result.overall_score}."
|
||||
)
|
||||
if print_detailed_results:
|
||||
AgentEvaluator._print_details(
|
||||
evaluation_result=evaluation_result,
|
||||
metric_name=metric_name,
|
||||
threshold=threshold,
|
||||
)
|
||||
|
||||
# Gather all the failures.
|
||||
if evaluation_result.overall_eval_status != EvalStatus.PASSED:
|
||||
failures.append(
|
||||
f"{metric_name} for {agent_module} Failed. Expected {threshold},"
|
||||
f" but got {evaluation_result.overall_score}."
|
||||
)
|
||||
|
||||
assert not failures, (
|
||||
"Following are all the test failures. If you looking to get more"
|
||||
" details on the failures, then please re-run this test with"
|
||||
" `print_details` set to `True`.\n{}".format("\n".join(failures))
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
async def evaluate(
|
||||
@@ -158,9 +184,10 @@ class AgentEvaluator:
|
||||
agent_module: The path to python module that contains the definition of
|
||||
the agent. There is convention in place here, where the code is going to
|
||||
look for 'root_agent' in the loaded module.
|
||||
eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
|
||||
full path to the file containing eval dataset, or a directory that is
|
||||
recursively explored for all files that have a `.test.json` suffix.
|
||||
eval_dataset_file_path_or_dir: The eval data set. This can be either a
|
||||
string representing full path to the file containing eval dataset, or a
|
||||
directory that is recursively explored for all files that have a
|
||||
`.test.json` suffix.
|
||||
num_runs: Number of times all entries in the eval dataset should be
|
||||
assessed.
|
||||
agent_name: The name of the agent.
|
||||
@@ -358,6 +385,11 @@ class AgentEvaluator:
|
||||
|
||||
@staticmethod
|
||||
def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
|
||||
try:
|
||||
from .response_evaluator import ResponseEvaluator
|
||||
from .trajectory_evaluator import TrajectoryEvaluator
|
||||
except ModuleNotFoundError as e:
|
||||
raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
|
||||
if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
|
||||
return TrajectoryEvaluator(threshold=threshold)
|
||||
elif (
|
||||
@@ -367,3 +399,60 @@ class AgentEvaluator:
|
||||
return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
|
||||
|
||||
raise ValueError(f"Unsupported eval metric: {metric_name}")
|
||||
|
||||
@staticmethod
def _print_details(
    evaluation_result: EvaluationResult, metric_name: str, threshold: float
) -> None:
  """Prints a summary line and a per-invocation table for one metric.

  Args:
    evaluation_result: The result whose overall status/score and
      per-invocation results are printed.
    metric_name: The name of the metric that was evaluated.
    threshold: The threshold the metric was expected to meet; repeated on
      every row of the table.

  Raises:
    ModuleNotFoundError: If `pandas` or `tabulate` cannot be imported.
  """
  try:
    # Import the package by its documented name. `from pandas import pandas`
    # only resolves because pandas happens to bind its own name inside its
    # `__init__`; if that ever changes it raises a plain ImportError, which
    # the handler below would not translate into the install hint.
    import pandas as pd
    from tabulate import tabulate
  except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e

  print(
      f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
      f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
      f" `{evaluation_result.overall_score}`."
  )

  # One row per invocation; expected and actual values sit side by side.
  rows = [
      {
          "eval_status": result.eval_status,
          "score": result.score,
          "threshold": threshold,
          "prompt": AgentEvaluator._convert_content_to_text(
              result.expected_invocation.user_content
          ),
          "expected_response": AgentEvaluator._convert_content_to_text(
              result.expected_invocation.final_response
          ),
          "actual_response": AgentEvaluator._convert_content_to_text(
              result.actual_invocation.final_response
          ),
          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
              result.expected_invocation.intermediate_data
          ),
          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
              result.actual_invocation.intermediate_data
          ),
      }
      for result in evaluation_result.per_invocation_results
  ]

  print(tabulate(pd.DataFrame(rows), headers="keys", tablefmt="grid"))
  print("\n\n")  # A few empty lines for visual clarity.
|
||||
|
||||
@staticmethod
def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
  """Joins the non-empty text of each part of `content` with newlines.

  Returns an empty string when `content` is falsy or has no parts.
  """
  if not content or not content.parts:
    return ""

  text_segments = []
  for part in content.parts:
    if part.text:
      text_segments.append(part.text)
  return "\n".join(text_segments)
|
||||
|
||||
@staticmethod
def _convert_tool_calls_to_text(
    intermediate_data: Optional[IntermediateData],
) -> str:
  """Renders each tool use as `str(tool_use)`, one per line.

  Returns an empty string when `intermediate_data` is falsy or holds no
  tool uses.
  """
  if not intermediate_data or not intermediate_data.tool_uses:
    return ""

  return "\n".join(map(str, intermediate_data.tool_uses))
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
# Copyright 2025 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# Error text used when re-raising a ModuleNotFoundError for an eval import.
MISSING_EVAL_DEPENDENCIES_MESSAGE = (
    "Eval module is not installed, "
    "please install via `pip install google-adk[eval]`."
)
|
||||
@@ -182,7 +182,7 @@ class EvaluationGenerator:
|
||||
tool_uses = []
|
||||
invocation_id = ""
|
||||
|
||||
for event in runner.run(
|
||||
async for event in runner.run_async(
|
||||
user_id=user_id, session_id=session_id, new_message=user_content
|
||||
):
|
||||
invocation_id = (
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
{
|
||||
"id": "test_id",
|
||||
"app_name": "trip_planner_agent",
|
||||
"user_id": "test_user",
|
||||
"state": {
|
||||
"origin": "San Francisco",
|
||||
"interests": "Food, Shopping, Museums",
|
||||
"range": "1000 miles",
|
||||
"cities": ""
|
||||
},
|
||||
"events": [],
|
||||
"last_update_time": 1741218714.258285
|
||||
}
|
||||
@@ -1,19 +1,116 @@
|
||||
[
|
||||
{
|
||||
"query": "Hi, who are you? What can you do?",
|
||||
"expected_tool_use": [],
|
||||
"reference": "I am trip_planner, and my goal is to plan the best trip ever. I can describe why a city was chosen, list its top attractions, and provide a detailed itinerary for each day of the trip.\n"
|
||||
},
|
||||
{
|
||||
"query": "I want to travel from San Francisco to an European country in fall next year. I am considering London and Paris. What is your advice?",
|
||||
"expected_tool_use": [
|
||||
{
|
||||
"tool_name": "transfer_to_agent",
|
||||
"tool_input": {
|
||||
"agent_name": "indentify_agent"
|
||||
{
|
||||
"eval_set_id": "e7996ccc-16bc-46bf-9a24-0a3ecc3dacd7",
|
||||
"name": "e7996ccc-16bc-46bf-9a24-0a3ecc3dacd7",
|
||||
"description": null,
|
||||
"eval_cases": [
|
||||
{
|
||||
"eval_id": "/google/src/cloud/ankusharma/CS-agent_evaluator-2025-06-17_115009/google3/third_party/py/google/adk/open_source_workspace/tests/integration/fixture/trip_planner_agent/trip_inquiry.test.json",
|
||||
"conversation": [
|
||||
{
|
||||
"invocation_id": "d7ff8ec1-290b-48c5-b3aa-05cb8f27b8ae",
|
||||
"user_content": {
|
||||
"parts": [
|
||||
{
|
||||
"video_metadata": null,
|
||||
"thought": null,
|
||||
"inline_data": null,
|
||||
"file_data": null,
|
||||
"thought_signature": null,
|
||||
"code_execution_result": null,
|
||||
"executable_code": null,
|
||||
"function_call": null,
|
||||
"function_response": null,
|
||||
"text": "Hi, who are you? What can you do?"
|
||||
}
|
||||
],
|
||||
"role": "user"
|
||||
},
|
||||
"final_response": {
|
||||
"parts": [
|
||||
{
|
||||
"video_metadata": null,
|
||||
"thought": null,
|
||||
"inline_data": null,
|
||||
"file_data": null,
|
||||
"thought_signature": null,
|
||||
"code_execution_result": null,
|
||||
"executable_code": null,
|
||||
"function_call": null,
|
||||
"function_response": null,
|
||||
"text": "I am trip_planner, and my goal is to plan the best trip ever. I can describe why a city was chosen, list its top attractions, and provide a detailed itinerary for each day of the trip.\n"
|
||||
}
|
||||
],
|
||||
"role": "model"
|
||||
},
|
||||
"intermediate_data": {
|
||||
"tool_uses": [],
|
||||
"intermediate_responses": []
|
||||
},
|
||||
"creation_timestamp": 1750190885.419684
|
||||
},
|
||||
{
|
||||
"invocation_id": "f515ff57-ff21-488f-ab92-7d7de5bb76fe",
|
||||
"user_content": {
|
||||
"parts": [
|
||||
{
|
||||
"video_metadata": null,
|
||||
"thought": null,
|
||||
"inline_data": null,
|
||||
"file_data": null,
|
||||
"thought_signature": null,
|
||||
"code_execution_result": null,
|
||||
"executable_code": null,
|
||||
"function_call": null,
|
||||
"function_response": null,
|
||||
"text": "I want to travel from San Francisco to an European country in fall next year. I am considering London and Paris. What is your advice?"
|
||||
}
|
||||
],
|
||||
"role": "user"
|
||||
},
|
||||
"final_response": {
|
||||
"parts": [
|
||||
{
|
||||
"video_metadata": null,
|
||||
"thought": null,
|
||||
"inline_data": null,
|
||||
"file_data": null,
|
||||
"thought_signature": null,
|
||||
"code_execution_result": null,
|
||||
"executable_code": null,
|
||||
"function_call": null,
|
||||
"function_response": null,
|
||||
"text": "Okay, I can help you analyze London and Paris to determine which city is better for your trip next fall. I will consider weather patterns, seasonal events, travel costs (including flights from San Francisco), and your interests (food, shopping, and museums). After gathering this information, I'll provide a detailed report on my chosen city.\n"
|
||||
}
|
||||
],
|
||||
"role": "model"
|
||||
},
|
||||
"intermediate_data": {
|
||||
"tool_uses": [
|
||||
{
|
||||
"id": null,
|
||||
"args": {
|
||||
"agent_name": "indentify_agent"
|
||||
},
|
||||
"name": "transfer_to_agent"
|
||||
}
|
||||
],
|
||||
"intermediate_responses": []
|
||||
},
|
||||
"creation_timestamp": 1750190885.4197457
|
||||
}
|
||||
}
|
||||
],
|
||||
"reference": "Okay, I can help you analyze London and Paris to determine which city is better for your trip next fall. I will consider weather patterns, seasonal events, travel costs (including flights from San Francisco), and your interests (food, shopping, and museums). After gathering this information, I'll provide a detailed report on my chosen city.\n"
|
||||
}
|
||||
]
|
||||
],
|
||||
"session_input": {
|
||||
"app_name": "trip_planner_agent",
|
||||
"user_id": "test_user",
|
||||
"state": {
|
||||
"origin": "San Francisco",
|
||||
"interests": "Food, Shopping, Museums",
|
||||
"range": "1000 miles",
|
||||
"cities": ""
|
||||
}
|
||||
},
|
||||
"creation_timestamp": 1750190885.4197533
|
||||
}
|
||||
],
|
||||
"creation_timestamp": 1750190885.4197605
|
||||
}
|
||||
Reference in New Issue
Block a user