feat: Adds Rubric based final response evaluator
The evaluator uses a set of rubrics to assess the quality of the agent's final response.

PiperOrigin-RevId: 811154498

committed by Copybara-Service
parent 01923a9227
commit 5a485b01cd
@@ -47,3 +47,17 @@ class AppDetails(EvalBaseModel):
       default_factory=dict,
   )
   """A mapping from the agent name to the details of that agent."""
+
+  def get_developer_instructions(self, agent_name: str) -> str:
+    """Returns a string containing the developer instructions."""
+    if agent_name not in self.agent_details:
+      raise ValueError(f"`{agent_name}` not found in the agentic system.")
+
+    return self.agent_details[agent_name].instructions
+
+  def get_tools_by_agent_name(self) -> dict[str, genai_types.ToolListUnion]:
+    """Returns a dictionary of tools available to each agent in the App, keyed by agent name."""
+    return {
+        name: details.tool_declarations
+        for name, details in self.agent_details.items()
+    }
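Illustrative sketch (not part of the commit) of the two accessors added above; it assumes the AgentDetails fields (name, instructions, tool_declarations) exercised by the new unit tests later in this diff:

from google.adk.evaluation.app_details import AgentDetails
from google.adk.evaluation.app_details import AppDetails

app_details = AppDetails(
    agent_details={
        'helper': AgentDetails(
            name='helper', instructions='be helpful', tool_declarations=[]
        ),
    }
)

assert app_details.get_developer_instructions('helper') == 'be helpful'
assert app_details.get_tools_by_agent_name() == {'helper': []}

# Unknown agent names raise, per the guard clause in the hunk above.
try:
  app_details.get_developer_instructions('unknown')
except ValueError:
  pass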
@@ -168,3 +168,54 @@ def get_all_tool_calls(
   )
 
   return tool_calls
+
+
+def get_all_tool_responses(
+    intermediate_data: Optional[IntermediateDataType],
+) -> list[genai_types.FunctionResponse]:
+  """A utility method to retrieve tool responses from intermediate data."""
+  if not intermediate_data:
+    return []
+
+  tool_responses = []
+  if isinstance(intermediate_data, IntermediateData):
+    tool_responses = intermediate_data.tool_responses
+  elif isinstance(intermediate_data, InvocationEvents):
+    # Go over each event in the list of events.
+    for invocation_event in intermediate_data.invocation_events:
+      # Check if the event has content and some parts.
+      if invocation_event.content and invocation_event.content.parts:
+        for p in invocation_event.content.parts:
+          # For each part, check if it is a function response.
+          if p.function_response:
+            tool_responses.append(p.function_response)
+  else:
+    raise ValueError(
+        f"Unsupported type for intermediate_data `{intermediate_data}`"
+    )
+
+  return tool_responses
+
+
+ToolCallAndResponse: TypeAlias = tuple[
+    genai_types.FunctionCall, Optional[genai_types.FunctionResponse]
+]
+"""A tuple representing a function call and the corresponding optional function response."""
+
+
+def get_all_tool_calls_with_responses(
+    intermediate_data: Optional[IntermediateDataType],
+) -> list[ToolCallAndResponse]:
+  """Returns tool calls with the corresponding responses, if available."""
+  tool_responses_by_call_id: dict[str, genai_types.FunctionResponse] = {
+      tool_response.id: tool_response
+      for tool_response in get_all_tool_responses(intermediate_data)
+  }
+
+  tool_call_and_responses: list[ToolCallAndResponse] = []
+
+  for tool_call in get_all_tool_calls(intermediate_data):
+    response = tool_responses_by_call_id.get(tool_call.id, None)
+    tool_call_and_responses.append((tool_call, response))
+
+  return tool_call_and_responses
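Illustrative sketch (not part of the commit) of the id-based pairing implemented above; the IntermediateData construction mirrors the new tests later in this diff:

from google.adk.evaluation.eval_case import get_all_tool_calls_with_responses
from google.adk.evaluation.eval_case import IntermediateData
from google.genai import types as genai_types

call = genai_types.FunctionCall(name='search', args={'query': 'weather'}, id='c1')
response = genai_types.FunctionResponse(
    name='search', response={'result': 'sunny'}, id='c1'
)
orphan_call = genai_types.FunctionCall(name='lookup', args={'id': '123'}, id='c2')

data = IntermediateData(tool_uses=[call, orphan_call], tool_responses=[response])

# A call whose id has no matching response is paired with None.
assert get_all_tool_calls_with_responses(data) == [
    (call, response),
    (orphan_call, None),
]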
@@ -48,6 +48,10 @@ class PrebuiltMetrics(Enum):
 
   FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
 
+  RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1 = (
+      "rubric_based_final_response_quality_v1"
+  )
+
 
 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
 Threshold: TypeAlias = float
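Illustrative sketch (not part of the commit) of referencing the new prebuilt metric when configuring an EvalMetric; the metric_name and threshold fields are inferred from their use elsewhere in this diff:

from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import PrebuiltMetrics

# MetricName is Union[str, PrebuiltMetrics], so the enum value (a plain string)
# identifies the metric; get_eval_status treats score >= threshold as PASSED.
metric = EvalMetric(
    metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
    threshold=0.5,
)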
@@ -23,6 +23,7 @@ from typing_extensions import TypeAlias
 from .eval_case import Invocation
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalStatus
+from .eval_rubrics import RubricScore
 
 # Redefining the type here for backward compatibility.
 EvalStatus: TypeAlias = EvalStatus
@@ -35,6 +36,7 @@ class PerInvocationResult(BaseModel):
   expected_invocation: Invocation
   score: Optional[float] = None
   eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
+  rubric_scores: Optional[list[RubricScore]] = None
 
 
 class EvaluationResult(BaseModel):
@@ -45,6 +47,10 @@ class EvaluationResult(BaseModel):
   """Overall status, based on each invocation."""
 
   per_invocation_results: list[PerInvocationResult] = []
   """Detailed results per invocation."""
 
+  overall_rubric_scores: Optional[list[RubricScore]] = None
+  """Overall rubric scores, based on each invocation."""
+
 
 class Evaluator(ABC):
@@ -33,6 +33,7 @@ from .eval_metrics import MetricValueInfo
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvaluationResult
 from .evaluator import PerInvocationResult
+from .llm_as_judge import AutoRaterScore
 from .llm_as_judge import LlmAsJudge
 from .llm_as_judge_utils import get_eval_status
 from .llm_as_judge_utils import get_text_from_content
@@ -179,17 +180,17 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
   @override
   def convert_auto_rater_response_to_score(
       self, llm_response: LlmResponse
-  ) -> Optional[float]:
+  ) -> AutoRaterScore:
     response_text = get_text_from_content(llm_response.content)
     if response_text is None:
-      return None
+      return AutoRaterScore()
     label = _parse_critique(response_text)
     if label == Label.VALID:
-      return 1.0
+      return AutoRaterScore(score=1.0)
     elif label == Label.INVALID:
-      return 0.0
+      return AutoRaterScore(score=0.0)
     else:
-      return None
+      return AutoRaterScore()
 
   @override
   def aggregate_per_invocation_samples(
@@ -26,15 +26,22 @@ from ..models.llm_request import LlmRequest
 from ..models.llm_response import LlmResponse
 from ..models.registry import LLMRegistry
 from ..utils.context_utils import Aclosing
+from .common import EvalBaseModel
 from .eval_case import Invocation
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
+from .eval_metrics import RubricScore
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
 from .llm_as_judge_utils import get_eval_status
 
 
+class AutoRaterScore(EvalBaseModel):
+  score: Optional[float] = None
+  rubric_scores: Optional[list[RubricScore]] = None
+
+
 class LlmAsJudge(Evaluator):
   """Evaluator based on an LLM.
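Illustrative sketch (not part of the commit) of the new return contract: judges now produce a structured AutoRaterScore rather than a bare Optional[float] (compare MockLlmAsJudge in the test changes below). The rubric_id is a made-up placeholder:

from google.adk.evaluation.eval_rubrics import RubricScore
from google.adk.evaluation.llm_as_judge import AutoRaterScore

# An unparsable auto-rater response maps to an empty AutoRaterScore...
empty = AutoRaterScore()
assert empty.score is None and empty.rubric_scores is None

# ...while a parsed response can carry an overall score plus per-rubric scores.
scored = AutoRaterScore(
    score=1.0,
    rubric_scores=[RubricScore(rubric_id='r1', score=1.0)],  # 'r1' is illustrative
)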
@@ -82,7 +89,7 @@ class LlmAsJudge(Evaluator):
   @abstractmethod
   def convert_auto_rater_response_to_score(
       self, auto_rater_response: LlmResponse
-  ) -> Optional[float]:
+  ) -> AutoRaterScore:
     """Parses auto_rater_response and returns the corresponding score, or None if the score cannot be determined."""
 
   @abstractmethod
@@ -126,15 +133,18 @@ class LlmAsJudge(Evaluator):
     ) as agen:
       async for llm_response in agen:
         # Non-streaming call, so there is only one response content.
-        score = self.convert_auto_rater_response_to_score(llm_response)
+        auto_rater_score = self.convert_auto_rater_response_to_score(
+            llm_response
+        )
         invocation_result_samples.append(
             PerInvocationResult(
                 actual_invocation=actual,
                 expected_invocation=expected,
-                score=score,
+                score=auto_rater_score.score,
                 eval_status=get_eval_status(
-                    score, self._criterion.threshold
+                    auto_rater_score.score, self._eval_metric.threshold
                 ),
+                rubric_scores=auto_rater_score.rubric_scores,
             )
         )
     if not invocation_result_samples:
@@ -15,10 +15,17 @@
 from __future__ import annotations
 
 import enum
+import statistics
 from typing import Optional
+from typing import Union
 
 from google.genai import types as genai_types
 
+from .app_details import AppDetails
+from .common import EvalBaseModel
+from .eval_case import get_all_tool_calls_with_responses
+from .eval_case import IntermediateDataType
+from .eval_metrics import RubricScore
 from .evaluator import EvalStatus
@@ -46,3 +53,97 @@ def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus:
   if score is None:
     return EvalStatus.NOT_EVALUATED
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
+
+
+def get_average_rubric_score(
+    rubric_scores: list[RubricScore],
+) -> Optional[float]:
+  """Returns a single score value from the given list of rubric scores.
+
+  It is possible that none of the rubric scores actually contains a score
+  value; if that happens, then None is returned.
+
+  If score values are present, then their mean is returned as the aggregated
+  value.
+  """
+  rubric_scores = [
+      rubric_score.score
+      for rubric_score in rubric_scores
+      if rubric_score.score is not None
+  ]
+
+  return statistics.mean(rubric_scores) if rubric_scores else None
+
+
+class _ToolDeclarations(EvalBaseModel):
+  """Internal data model used for serializing tool declarations."""
+
+  tool_declarations: dict[str, genai_types.ToolListUnion]
+
+
+def get_tool_declarations_as_json_str(
+    app_details: AppDetails,
+) -> str:
+  """Returns a JSON string representation of tool declarations.
+
+  The output of this method is usually intended to be sent to the LLM.
+  """
+  tool_declarations = _ToolDeclarations(
+      tool_declarations=app_details.get_tools_by_agent_name()
+  )
+  return tool_declarations.model_dump_json(
+      indent=2,
+      exclude_unset=True,
+      exclude_defaults=True,
+      exclude_none=True,
+  )
+
+
+class _ToolCallAndResponse(EvalBaseModel):
+  """Internal data model to capture one single tool call and response."""
+
+  step: int
+  tool_call: genai_types.FunctionCall
+  tool_response: Union[genai_types.FunctionResponse, str]
+
+
+class _ToolCallsAndResponses(EvalBaseModel):
+  """Internal data model used for serializing tool calls and responses."""
+
+  tool_calls_and_response: list[_ToolCallAndResponse]
+
+
+def get_tool_calls_and_responses_as_json_str(
+    intermediate_data: Optional[IntermediateDataType],
+) -> str:
+  """Returns a JSON string representation of tool calls and corresponding responses.
+
+  The output of this method is usually intended to be sent to the LLM.
+  """
+  raw_tool_calls_and_response = get_all_tool_calls_with_responses(
+      intermediate_data
+  )
+
+  if not raw_tool_calls_and_response:
+    return "No intermediate steps were taken."
+
+  tool_calls_and_responses = []
+  for idx, (tool_call, tool_response) in enumerate(raw_tool_calls_and_response):
+    tool_calls_and_responses.append(
+        _ToolCallAndResponse(
+            step=idx,
+            tool_call=tool_call,
+            tool_response=tool_response if tool_response else "None",
+        )
+    )
+
+  internal_tool_calls_and_responses = _ToolCallsAndResponses(
+      tool_calls_and_response=tool_calls_and_responses
+  )
+
+  return internal_tool_calls_and_responses.model_dump_json(
+      indent=2,
+      exclude_unset=True,
+      exclude_defaults=True,
+      exclude_none=True,
+  )
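Illustrative sketch (not part of the commit) of the aggregation rule above: None scores are dropped and the mean of the remainder is returned, or None if nothing remains. The values mirror the new unit tests at the end of this diff:

from google.adk.evaluation.eval_rubrics import RubricScore
from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score

scores = [
    RubricScore(rubric_id='1', score=0.8),
    RubricScore(rubric_id='2', score=None),  # ignored by the aggregation
    RubricScore(rubric_id='3', score=0.6),
]
assert get_average_rubric_score(scores) == 0.7
assert get_average_rubric_score([]) is None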
@@ -40,6 +40,7 @@ from .base_eval_service import InferenceStatus
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultDetails
 from .eval_metrics import EvalMetricResultPerInvocation
 from .eval_result import EvalCaseResult
 from .eval_set import EvalCase
@@ -239,12 +240,15 @@ class LocalEvalService(BaseEvalService):
       )
 
       # Track overall score across all invocations.
+      eval_metric_result_details = EvalMetricResultDetails(
+          rubric_scores=evaluation_result.overall_rubric_scores
+      )
       overall_eval_metric_results.append(
           EvalMetricResult(
-              metric_name=eval_metric.metric_name,
-              threshold=eval_metric.threshold,
               score=evaluation_result.overall_score,
               eval_status=evaluation_result.overall_eval_status,
+              details=eval_metric_result_details,
+              **eval_metric.model_dump(),
           )
       )
@@ -262,12 +266,15 @@ class LocalEvalService(BaseEvalService):
         evaluation_result.per_invocation_results,
         eval_metric_result_per_invocation,
     ):
+      eval_metric_result_details = EvalMetricResultDetails(
+          rubric_scores=invocation_result.rubric_scores
+      )
       invocation.eval_metric_results.append(
           EvalMetricResult(
-              metric_name=eval_metric.metric_name,
-              threshold=eval_metric.threshold,
              score=invocation_result.score,
              eval_status=invocation_result.eval_status,
+              details=eval_metric_result_details,
+              **eval_metric.model_dump(),
          )
      )
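Illustrative sketch (not part of the commit) of the new plumbing: rubric scores ride along on the metric result via EvalMetricResultDetails. The rubric_id is a made-up placeholder:

from google.adk.evaluation.eval_metrics import EvalMetricResultDetails
from google.adk.evaluation.eval_rubrics import RubricScore

details = EvalMetricResultDetails(
    rubric_scores=[RubricScore(rubric_id='clarity', score=1.0)]
)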
@@ -24,6 +24,7 @@ from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
 from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
 from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator
@@ -111,6 +112,10 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
       metric_info=FinalResponseMatchV2Evaluator.get_metric_info(),
       evaluator=FinalResponseMatchV2Evaluator,
   )
+  metric_evaluator_registry.register_evaluator(
+      metric_info=RubricBasedFinalResponseQualityV1Evaluator.get_metric_info(),
+      evaluator=RubricBasedFinalResponseQualityV1Evaluator,
+  )
 
   return metric_evaluator_registry
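Illustrative sketch (not part of the commit): building the default registry now also registers the rubric-based evaluator. The module path google.adk.evaluation.metric_evaluator_registry is an assumption; only the function name appears in this diff, and it is module-internal:

from google.adk.evaluation.metric_evaluator_registry import (
    _get_default_metric_evaluator_registry,  # assumed import path
)

registry = _get_default_metric_evaluator_registry()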
File diff suppressed because it is too large.
@@ -0,0 +1,73 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from google.adk.evaluation.app_details import AgentDetails
+from google.adk.evaluation.app_details import AppDetails
+from google.genai import types as genai_types
+from pytest import raises
+
+
+def test_get_developer_instructions_existing_agent():
+  agent_details = {
+      'agent1': AgentDetails(
+          name='agent1', instructions='instruction for agent1'
+      ),
+      'agent2': AgentDetails(
+          name='agent2', instructions='instruction for agent2'
+      ),
+  }
+  app_details = AppDetails(
+      agent_details=agent_details,
+  )
+
+  # Test for an existing agent.
+  instructions = app_details.get_developer_instructions('agent1')
+  assert instructions == 'instruction for agent1'
+
+
+def test_get_developer_instructions_non_existing_Agent():
+  agent_details = {
+      'agent1': AgentDetails(
+          name='agent1', instructions='instruction for agent1'
+      ),
+      'agent2': AgentDetails(
+          name='agent2', instructions='instruction for agent2'
+      ),
+  }
+  app_details = AppDetails(
+      agent_details=agent_details,
+  )
+
+  # Test for a non-existing agent.
+  with raises(ValueError, match='`agent3` not found in the agentic system.'):
+    app_details.get_developer_instructions('agent3')
+
+
+def test_get_tools_by_agent_name():
+  tool1 = genai_types.Tool(
+      function_declarations=[genai_types.FunctionDeclaration(name='tool1_func')]
+  )
+  agent_details = {
+      'agent1': AgentDetails(name='agent1', tool_declarations=[tool1]),
+      'agent2': AgentDetails(name='agent2', tool_declarations=[]),
+  }
+  app_details = AppDetails(
+      agent_details=agent_details,
+  )
+
+  tools = app_details.get_tools_by_agent_name()
+  expected_tools = {'agent1': [tool1], 'agent2': []}
+  assert tools == expected_tools
@@ -15,6 +15,8 @@
 from __future__ import annotations
 
 from google.adk.evaluation.eval_case import get_all_tool_calls
+from google.adk.evaluation.eval_case import get_all_tool_calls_with_responses
+from google.adk.evaluation.eval_case import get_all_tool_responses
 from google.adk.evaluation.eval_case import IntermediateData
 from google.adk.evaluation.eval_case import InvocationEvent
 from google.adk.evaluation.eval_case import InvocationEvents
@@ -97,3 +99,150 @@ def test_get_all_tool_calls_with_unsupported_type():
       ValueError, match='Unsupported type for intermediate_data'
   ):
     get_all_tool_calls('this is not a valid type')
+
+
+def test_get_all_tool_responses_with_none_input():
+  """Tests that an empty list is returned when intermediate_data is None."""
+  assert get_all_tool_responses(None) == []
+
+
+def test_get_all_tool_responses_with_empty_invocation_events():
+  """Tests InvocationEvents with an empty list of events."""
+  intermediate_data = InvocationEvents(invocation_events=[])
+  assert get_all_tool_responses(intermediate_data) == []
+
+
+def test_get_all_tool_responses_with_invocation_events_no_tools():
+  """Tests InvocationEvents containing events without any tool responses."""
+  invocation_event = InvocationEvent(
+      author='agent',
+      content=genai_types.Content(
+          parts=[genai_types.Part(text='Thinking...')], role='model'
+      ),
+  )
+  intermediate_data = InvocationEvents(invocation_events=[invocation_event])
+  assert get_all_tool_responses(intermediate_data) == []
+
+
+def test_get_all_tool_responses_with_invocation_events():
+  """Tests that tool responses are correctly extracted from an InvocationEvents object."""
+  tool_response1 = genai_types.FunctionResponse(
+      name='search', response={'result': 'weather is good'}
+  )
+  tool_response2 = genai_types.FunctionResponse(
+      name='lookup', response={'id': '123'}
+  )
+  invocation_event1 = InvocationEvent(
+      author='agent1',
+      content=genai_types.Content(
+          parts=[genai_types.Part(function_response=tool_response1)],
+          role='model',
+      ),
+  )
+  invocation_event2 = InvocationEvent(
+      author='agent2',
+      content=genai_types.Content(
+          parts=[
+              genai_types.Part(text='Found something.'),
+              genai_types.Part(function_response=tool_response2),
+          ],
+          role='model',
+      ),
+  )
+  intermediate_data = InvocationEvents(
+      invocation_events=[invocation_event1, invocation_event2]
+  )
+  assert get_all_tool_responses(intermediate_data) == [
+      tool_response1,
+      tool_response2,
+  ]
+
+
+def test_get_all_tool_responses_with_unsupported_type():
+  """Tests that a ValueError is raised for unsupported intermediate_data types."""
+  with pytest.raises(
+      ValueError, match='Unsupported type for intermediate_data'
+  ):
+    get_all_tool_responses('this is not a valid type')
+
+
+def test_get_all_tool_calls_with_responses_with_none_input():
+  """Tests that an empty list is returned when intermediate_data is None."""
+  assert get_all_tool_calls_with_responses(None) == []
+
+
+def test_get_all_tool_calls_with_responses_with_intermediate_data_no_tool_calls():
+  """Tests get_all_tool_calls_with_responses with IntermediateData with no tool calls."""
+  # No tool calls.
+  intermediate_data = IntermediateData(tool_uses=[], tool_responses=[])
+  assert get_all_tool_calls_with_responses(intermediate_data) == []
+
+
+def test_get_all_tool_calls_with_responses_with_intermediate_data_with_tool_calls():
+  """Tests get_all_tool_calls_with_responses with IntermediateData with tools."""
+  # With matching and non-matching tool calls.
+  tool_call1 = genai_types.FunctionCall(
+      name='search', args={'query': 'weather'}, id='call1'
+  )
+  tool_response1 = genai_types.FunctionResponse(
+      name='search', response={'result': 'sunny'}, id='call1'
+  )
+  tool_call2 = genai_types.FunctionCall(
+      name='lookup', args={'id': '123'}, id='call2'
+  )
+  intermediate_data = IntermediateData(
+      tool_uses=[tool_call1, tool_call2], tool_responses=[tool_response1]
+  )
+  assert get_all_tool_calls_with_responses(intermediate_data) == [
+      (tool_call1, tool_response1),
+      (tool_call2, None),
+  ]
+
+
+def test_get_all_tool_calls_with_responses_with_steps_no_tool_calls():
+  """Tests get_all_tool_calls_with_responses with steps that don't have tool calls."""
+  # No tool calls.
+  intermediate_data = InvocationEvents(invocation_events=[])
+  assert get_all_tool_calls_with_responses(intermediate_data) == []
+
+
+def test_get_all_tool_calls_with_responses_with_invocation_events():
+  """Tests get_all_tool_calls_with_responses with InvocationEvents."""
+  # No tools.
+  intermediate_data = InvocationEvents(invocation_events=[])
+  assert get_all_tool_calls_with_responses(intermediate_data) == []
+
+  # With matching and non-matching tool calls.
+  tool_call1 = genai_types.FunctionCall(
+      name='search', args={'query': 'weather'}, id='call1'
+  )
+  tool_response1 = genai_types.FunctionResponse(
+      name='search', response={'result': 'sunny'}, id='call1'
+  )
+  tool_call2 = genai_types.FunctionCall(
+      name='lookup', args={'id': '123'}, id='call2'
+  )
+  invocation_event1 = InvocationEvent(
+      author='agent',
+      content=genai_types.Content(
+          parts=[
+              genai_types.Part(function_call=tool_call1),
+              genai_types.Part(function_call=tool_call2),
+          ],
+          role='model',
+      ),
+  )
+  invocation_event2 = InvocationEvent(
+      author='tool',
+      content=genai_types.Content(
+          parts=[genai_types.Part(function_response=tool_response1)],
+          role='tool',
+      ),
+  )
+  intermediate_data = InvocationEvents(
+      invocation_events=[invocation_event1, invocation_event2]
+  )
+  assert get_all_tool_calls_with_responses(intermediate_data) == [
+      (tool_call1, tool_response1),
+      (tool_call2, None),
+  ]
@@ -17,12 +17,13 @@ from __future__ import annotations
 from google.adk.evaluation.eval_case import Invocation
 from google.adk.evaluation.eval_metrics import BaseCriterion
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import EvalStatus
 from google.adk.evaluation.eval_metrics import JudgeModelOptions
 from google.adk.evaluation.eval_metrics import PrebuiltMetrics
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.evaluator import PerInvocationResult
 from google.adk.evaluation.final_response_match_v2 import _parse_critique
 from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
+from google.adk.evaluation.llm_as_judge import AutoRaterScore
 from google.adk.evaluation.llm_as_judge_utils import Label
 from google.adk.models.llm_response import LlmResponse
 from google.genai import types as genai_types
@@ -206,8 +207,10 @@ def test_convert_auto_rater_response_to_score_valid():
           role="model",
       )
   )
-  score = evaluator.convert_auto_rater_response_to_score(llm_response)
-  assert score == 1.0
+  auto_rater_score = evaluator.convert_auto_rater_response_to_score(
+      llm_response
+  )
+  assert auto_rater_score == AutoRaterScore(score=1.0)
 
 
 def test_convert_auto_rater_response_to_score_invalid():
@@ -224,8 +227,10 @@ def test_convert_auto_rater_response_to_score_invalid():
           role="model",
       )
   )
-  score = evaluator.convert_auto_rater_response_to_score(llm_response)
-  assert score == 0.0
+  auto_rater_score = evaluator.convert_auto_rater_response_to_score(
+      llm_response
+  )
+  assert auto_rater_score == AutoRaterScore(score=0.0)
 
 
 def test_convert_auto_rater_response_to_score_invalid_json():
@@ -236,8 +241,10 @@ def test_convert_auto_rater_response_to_score_invalid_json():
           role="model",
       )
   )
-  score = evaluator.convert_auto_rater_response_to_score(llm_response)
-  assert score is None
+  auto_rater_score = evaluator.convert_auto_rater_response_to_score(
+      llm_response
+  )
+  assert auto_rater_score == AutoRaterScore()
 
 
 def test_convert_auto_rater_response_to_score_missing_key():
@@ -248,8 +255,10 @@ def test_convert_auto_rater_response_to_score_missing_key():
           role="model",
       )
   )
-  score = evaluator.convert_auto_rater_response_to_score(llm_response)
-  assert score is None
+  auto_rater_score = evaluator.convert_auto_rater_response_to_score(
+      llm_response
+  )
+  assert auto_rater_score == AutoRaterScore()
 
 
 def test_aggregate_per_invocation_samples_none_evaluated():
@@ -24,6 +24,7 @@ from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.evaluator import EvaluationResult
 from google.adk.evaluation.evaluator import PerInvocationResult
+from google.adk.evaluation.llm_as_judge import AutoRaterScore
 from google.adk.evaluation.llm_as_judge import LlmAsJudge
 from google.adk.evaluation.llm_as_judge_utils import get_eval_status
 from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
@@ -41,8 +42,8 @@ class MockLlmAsJudge(LlmAsJudge):
 
   def convert_auto_rater_response_to_score(
       self, llm_response: LlmResponse
-  ) -> Optional[float]:
-    return 1.0
+  ) -> AutoRaterScore:
+    return AutoRaterScore(score=1.0)
 
   def aggregate_per_invocation_samples(
       self,
@@ -0,0 +1,290 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+
+from google.adk.evaluation.app_details import AgentDetails
+from google.adk.evaluation.app_details import AppDetails
+from google.adk.evaluation.eval_case import IntermediateData
+from google.adk.evaluation.eval_case import InvocationEvent
+from google.adk.evaluation.eval_case import InvocationEvents
+from google.adk.evaluation.eval_rubrics import RubricScore
+from google.adk.evaluation.evaluator import EvalStatus
+from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score
+from google.adk.evaluation.llm_as_judge_utils import get_eval_status
+from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
+from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
+from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str
+from google.genai import types as genai_types
+
+
+def test_get_text_from_content_with_none():
+  """Tests get_text_from_content with None as input."""
+  assert get_text_from_content(None) is None
+
+
+def test_get_text_from_content_with_content_and_none_parts():
+  """Tests get_text_from_content with Content that has None for parts."""
+  content = genai_types.Content(parts=None)
+  assert get_text_from_content(content) is None
+
+
+def test_get_text_from_content_with_empty_parts():
+  """Tests get_text_from_content with an empty parts list."""
+  content = genai_types.Content(parts=[])
+  assert get_text_from_content(content) is None
+
+
+def test_get_text_from_content_with_parts_but_no_text():
+  """Tests get_text_from_content with parts that do not contain text."""
+  content = genai_types.Content(
+      parts=[
+          genai_types.Part(
+              function_call=genai_types.FunctionCall(name="test_func")
+          )
+      ]
+  )
+  assert get_text_from_content(content) == ""
+
+
+def test_get_text_from_content_with_single_text_part():
+  """Tests get_text_from_content with a single text part."""
+  content = genai_types.Content(parts=[genai_types.Part(text="Hello")])
+  assert get_text_from_content(content) == "Hello"
+
+
+def test_get_text_from_content_with_multiple_text_parts():
+  """Tests get_text_from_content with multiple text parts."""
+  content = genai_types.Content(
+      parts=[genai_types.Part(text="Hello"), genai_types.Part(text="World")]
+  )
+  assert get_text_from_content(content) == "Hello\nWorld"
+
+
+def test_get_text_from_content_with_mixed_parts():
+  """Tests get_text_from_content with a mix of text and non-text parts."""
+  content = genai_types.Content(
+      parts=[
+          genai_types.Part(text="Hello"),
+          genai_types.Part(
+              function_call=genai_types.FunctionCall(name="test_func")
+          ),
+          genai_types.Part(text="World"),
+      ]
+  )
+  assert get_text_from_content(content) == "Hello\nWorld"
+
+
+def test_get_eval_status_with_none_score():
+  """Tests get_eval_status returns NOT_EVALUATED for a None score."""
+  assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED
+
+
+def test_get_eval_status_when_score_is_greater_than_threshold():
+  """Tests get_eval_status returns PASSED when score > threshold."""
+  assert get_eval_status(score=0.8, threshold=0.5) == EvalStatus.PASSED
+
+
+def test_get_eval_status_when_score_is_equal_to_threshold():
+  """Tests get_eval_status returns PASSED when score == threshold."""
+  assert get_eval_status(score=0.5, threshold=0.5) == EvalStatus.PASSED
+
+
+def test_get_eval_status_when_score_is_less_than_threshold():
+  """Tests get_eval_status returns FAILED when score < threshold."""
+  assert get_eval_status(score=0.4, threshold=0.5) == EvalStatus.FAILED
+
+
+def test_get_average_rubric_score_with_empty_list():
+  """Tests get_average_rubric_score returns None for an empty list."""
+  assert get_average_rubric_score([]) is None
+
+
+def test_get_average_rubric_score_with_all_none_scores():
+  """Tests get_average_rubric_score returns None when all scores are None."""
+  rubric_scores = [
+      RubricScore(rubric_id="1", score=None),
+      RubricScore(rubric_id="2", score=None),
+  ]
+  assert get_average_rubric_score(rubric_scores) is None
+
+
+def test_get_average_rubric_score_with_single_score():
+  """Tests get_average_rubric_score with a single valid score."""
+  rubric_scores = [RubricScore(rubric_id="1", score=0.8)]
+  assert get_average_rubric_score(rubric_scores) == 0.8
+
+
+def test_get_average_rubric_score_with_multiple_scores():
+  """Tests get_average_rubric_score with multiple valid scores."""
+  rubric_scores = [
+      RubricScore(rubric_id="1", score=0.8),
+      RubricScore(rubric_id="2", score=0.6),
+  ]
+  assert get_average_rubric_score(rubric_scores) == 0.7
+
+
+def test_get_average_rubric_score_with_mixed_scores():
+  """Tests get_average_rubric_score with a mix of valid and None scores."""
+  rubric_scores = [
+      RubricScore(rubric_id="1", score=0.8),
+      RubricScore(rubric_id="2", score=None),
+      RubricScore(rubric_id="3", score=0.6),
+  ]
+  assert get_average_rubric_score(rubric_scores) == 0.7
+
+
+def test_get_tool_declarations_as_json_str_with_no_agents():
+  """Tests get_tool_declarations_as_json_str with no agents."""
+  app_details = AppDetails(agent_details={})
+  expected_json = {"tool_declarations": {}}
+  actual_json_str = get_tool_declarations_as_json_str(app_details)
+  assert json.loads(actual_json_str) == expected_json
+
+
+def test_get_tool_declarations_as_json_str_with_agent_no_tools():
+  """Tests get_tool_declarations_as_json_str with an agent that has no tools."""
+  agent_details = {"agent1": AgentDetails(name="agent1", tool_declarations=[])}
+  app_details = AppDetails(agent_details=agent_details)
+  expected_json = {"tool_declarations": {"agent1": []}}
+  actual_json_str = get_tool_declarations_as_json_str(app_details)
+  assert json.loads(actual_json_str) == expected_json
+
+
+def test_get_tool_declarations_as_json_str_with_agent_with_tools():
+  """Tests get_tool_declarations_as_json_str with an agent that has tools."""
+  tool1 = genai_types.Tool(
+      function_declarations=[
+          genai_types.FunctionDeclaration(
+              name="test_func", description="A test function."
+          )
+      ]
+  )
+  agent_details = {
+      "agent1": AgentDetails(name="agent1", tool_declarations=[tool1])
+  }
+  app_details = AppDetails(agent_details=agent_details)
+  expected_json = {
+      "tool_declarations": {
+          "agent1": [{
+              "function_declarations": [{
+                  "name": "test_func",
+                  "description": "A test function.",
+              }]
+          }]
+      }
+  }
+  actual_json_str = get_tool_declarations_as_json_str(app_details)
+  assert json.loads(actual_json_str) == expected_json
+
+
+def test_get_tool_declarations_as_json_str_with_multiple_agents():
+  """Tests get_tool_declarations_as_json_str with multiple agents."""
+  tool1 = genai_types.Tool(
+      function_declarations=[
+          genai_types.FunctionDeclaration(
+              name="test_func1", description="A test function 1."
+          )
+      ]
+  )
+  agent_details = {
+      "agent1": AgentDetails(name="agent1", tool_declarations=[tool1]),
+      "agent2": AgentDetails(name="agent2", tool_declarations=[]),
+  }
+  app_details = AppDetails(agent_details=agent_details)
+  expected_json = {
+      "tool_declarations": {
+          "agent1": [{
+              "function_declarations": [{
+                  "name": "test_func1",
+                  "description": "A test function 1.",
+              }]
+          }],
+          "agent2": [],
+      }
+  }
+  actual_json_str = get_tool_declarations_as_json_str(app_details)
+  assert json.loads(actual_json_str) == expected_json
+
+
+def test_get_tool_calls_and_responses_as_json_str_with_none():
+  """Tests get_tool_calls_and_responses_as_json_str with None."""
+  assert (
+      get_tool_calls_and_responses_as_json_str(None)
+      == "No intermediate steps were taken."
+  )
+
+
+def test_get_tool_calls_and_responses_as_json_str_with_intermediate_data_no_tools():
+  """Tests get_tool_calls_and_responses_as_json_str with IntermediateData and no tools."""
+  intermediate_data = IntermediateData(tool_uses=[], tool_responses=[])
+  assert (
+      get_tool_calls_and_responses_as_json_str(intermediate_data)
+      == "No intermediate steps were taken."
+  )
+
+  intermediate_data = InvocationEvents(invocation_events=[])
+  assert (
+      get_tool_calls_and_responses_as_json_str(intermediate_data)
+      == "No intermediate steps were taken."
+  )
+
+
+def test_get_tool_calls_and_responses_as_json_str_with_invocation_events_multiple_calls():
+  """Tests get_tool_calls_and_responses_as_json_str with multiple calls in InvocationEvents."""
+  tool_call1 = genai_types.FunctionCall(name="func1", args={}, id="call1")
+  tool_call2 = genai_types.FunctionCall(name="func2", args={}, id="call2")
+  tool_response1 = genai_types.FunctionResponse(
+      name="func1", response={"status": "ok"}, id="call1"
+  )
+  invocation_event1 = InvocationEvent(
+      author="agent",
+      content=genai_types.Content(
+          parts=[
+              genai_types.Part(function_call=tool_call1),
+              genai_types.Part(function_call=tool_call2),
+          ]
+      ),
+  )
+  invocation_event2 = InvocationEvent(
+      author="tool",
+      content=genai_types.Content(
+          parts=[genai_types.Part(function_response=tool_response1)]
+      ),
+  )
+  intermediate_data = InvocationEvents(
+      invocation_events=[invocation_event1, invocation_event2]
+  )
+  json_str = get_tool_calls_and_responses_as_json_str(intermediate_data)
+  expected_json = {
+      "tool_calls_and_response": [
+          {
+              "step": 0,
+              "tool_call": {"name": "func1", "args": {}, "id": "call1"},
+              "tool_response": {
+                  "name": "func1",
+                  "response": {"status": "ok"},
+                  "id": "call1",
+              },
+          },
+          {
+              "step": 1,
+              "tool_call": {"name": "func2", "args": {}, "id": "call2"},
+              "tool_response": "None",
+          },
+      ]
+  }
+  assert json.loads(json_str) == expected_json
File diff suppressed because it is too large.