feat: Adds rubric-based final response evaluator

The evaluator uses a set of rubrics to assess the quality of the agent's final response.
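For orientation, a minimal sketch (all names come from this change; the rubric ids, scores, and threshold are illustrative) of how per-rubric scores from the auto-rater aggregate into an overall score and eval status:

from google.adk.evaluation.eval_rubrics import RubricScore
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score
from google.adk.evaluation.llm_as_judge_utils import get_eval_status

rubric_scores = [
    RubricScore(rubric_id="1", score=1.0),   # rubric satisfied
    RubricScore(rubric_id="2", score=0.0),   # rubric not satisfied
    RubricScore(rubric_id="3", score=None),  # auto-rater could not decide
]

overall = get_average_rubric_score(rubric_scores)  # mean over non-None scores -> 0.5
status = get_eval_status(overall, threshold=0.5)   # EvalStatus.PASSED since 0.5 >= 0.5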

PiperOrigin-RevId: 811154498
Ankur Sharma
2025-09-24 20:30:09 -07:00
committed by Copybara-Service
parent 01923a9227
commit 5a485b01cd
16 changed files with 1969 additions and 24 deletions
@@ -47,3 +47,17 @@ class AppDetails(EvalBaseModel):
default_factory=dict,
)
"""A mapping from the agent name to the details of that agent."""
def get_developer_instructions(self, agent_name: str) -> str:
"""Returns a string containing the developer instructions."""
if agent_name not in self.agent_details:
raise ValueError(f"`{agent_name}` not found in the agentic system.")
return self.agent_details[agent_name].instructions
def get_tools_by_agent_name(self) -> dict[str, genai_types.ToolListUnion]:
"""Returns a dictionary of tools available to an agent in the App, keyed to the name of the Agent."""
return {
name: details.tool_declarations
for name, details in self.agent_details.items()
}
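A brief usage sketch of the two new AppDetails helpers, mirroring the added unit tests (the agent name, instructions, and tool below are illustrative):

from google.adk.evaluation.app_details import AgentDetails
from google.adk.evaluation.app_details import AppDetails
from google.genai import types as genai_types

search_tool = genai_types.Tool(
    function_declarations=[genai_types.FunctionDeclaration(name='search')]
)
app_details = AppDetails(
    agent_details={
        'root_agent': AgentDetails(
            name='root_agent',
            instructions='Answer the user politely.',
            tool_declarations=[search_tool],
        )
    }
)

app_details.get_developer_instructions('root_agent')  # 'Answer the user politely.'
app_details.get_tools_by_agent_name()  # {'root_agent': [search_tool]}
app_details.get_developer_instructions('unknown')  # raises ValueError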
@@ -168,3 +168,54 @@ def get_all_tool_calls(
)
return tool_calls
def get_all_tool_responses(
intermediate_data: Optional[IntermediateDataType],
) -> list[genai_types.FunctionResponse]:
"""A utility method to retrieve tools responses from intermediate data."""
if not intermediate_data:
return []
tool_responses = []
if isinstance(intermediate_data, IntermediateData):
tool_responses = intermediate_data.tool_responses
elif isinstance(intermediate_data, InvocationEvents):
# Go over each event in the list of events
for invocation_event in intermediate_data.invocation_events:
# Check if the event has content and some parts.
if invocation_event.content and invocation_event.content.parts:
for p in invocation_event.content.parts:
# Check whether the part is a function response.
if p.function_response:
tool_responses.append(p.function_response)
else:
raise ValueError(
f"Unsupported type for intermediate_data `{intermediate_data}`"
)
return tool_responses
ToolCallAndResponse: TypeAlias = tuple[
genai_types.FunctionCall, Optional[genai_types.FunctionResponse]
]
"""A Tuple representing a Function call and corresponding optional function response."""
def get_all_tool_calls_with_responses(
intermediate_data: Optional[IntermediateDataType],
) -> list[ToolCallAndResponse]:
"""Returns tool calls with the corresponding responses, if available."""
tool_responses_by_call_id: dict[str, genai_types.FunctionResponse] = {
tool_response.id: tool_response
for tool_response in get_all_tool_responses(intermediate_data)
}
tool_call_and_responses: list[ToolCallAndResponse] = []
for tool_call in get_all_tool_calls(intermediate_data):
response = tool_responses_by_call_id.get(tool_call.id, None)
tool_call_and_responses.append((tool_call, response))
return tool_call_and_responses
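A usage sketch for the new pairing helper, following the added tests (the calls and responses are illustrative):

from google.adk.evaluation.eval_case import get_all_tool_calls_with_responses
from google.adk.evaluation.eval_case import IntermediateData
from google.genai import types as genai_types

tool_call1 = genai_types.FunctionCall(name='search', args={'query': 'weather'}, id='call1')
tool_response1 = genai_types.FunctionResponse(
    name='search', response={'result': 'sunny'}, id='call1'
)
tool_call2 = genai_types.FunctionCall(name='lookup', args={'id': '123'}, id='call2')

intermediate_data = IntermediateData(
    tool_uses=[tool_call1, tool_call2], tool_responses=[tool_response1]
)
# Calls are matched to responses by id; a call with no matching response is
# paired with None.
get_all_tool_calls_with_responses(intermediate_data)
# -> [(tool_call1, tool_response1), (tool_call2, None)]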
@@ -48,6 +48,10 @@ class PrebuiltMetrics(Enum):
FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1 = (
"rubric_based_final_response_quality_v1"
)
MetricName: TypeAlias = Union[str, PrebuiltMetrics]
Threshold: TypeAlias = float
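A hedged sketch of referencing the new prebuilt metric when configuring an eval (the threshold value is illustrative; EvalMetric fields are the ones used elsewhere in this change):

from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import PrebuiltMetrics

eval_metric = EvalMetric(
    metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
    threshold=0.5,
)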
@@ -23,6 +23,7 @@ from typing_extensions import TypeAlias
from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalStatus
from .eval_rubrics import RubricScore
# Redefining the type here for backward compatibility.
EvalStatus: TypeAlias = EvalStatus
@@ -35,6 +36,7 @@ class PerInvocationResult(BaseModel):
expected_invocation: Invocation
score: Optional[float] = None
eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
rubric_scores: Optional[list[RubricScore]] = None
class EvaluationResult(BaseModel):
@@ -45,6 +47,10 @@ class EvaluationResult(BaseModel):
"""Overall status, based on each invocation."""
per_invocation_results: list[PerInvocationResult] = []
"""Detailed results per invocation."""
overall_rubric_scores: Optional[list[RubricScore]] = None
"""Overall rubric, based on each invocation."""
class Evaluator(ABC):
@@ -33,6 +33,7 @@ from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .evaluator import EvaluationResult
from .evaluator import PerInvocationResult
from .llm_as_judge import AutoRaterScore
from .llm_as_judge import LlmAsJudge
from .llm_as_judge_utils import get_eval_status
from .llm_as_judge_utils import get_text_from_content
@@ -179,17 +180,17 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
@override
def convert_auto_rater_response_to_score(
self, llm_response: LlmResponse
) -> Optional[float]:
) -> AutoRaterScore:
response_text = get_text_from_content(llm_response.content)
if response_text is None:
return None
return AutoRaterScore()
label = _parse_critique(response_text)
if label == Label.VALID:
return 1.0
return AutoRaterScore(score=1.0)
elif label == Label.INVALID:
return 0.0
return AutoRaterScore(score=0.0)
else:
return None
return AutoRaterScore()
@override
def aggregate_per_invocation_samples(
@@ -26,15 +26,22 @@ from ..models.llm_request import LlmRequest
from ..models.llm_response import LlmResponse
from ..models.registry import LLMRegistry
from ..utils.context_utils import Aclosing
from .common import EvalBaseModel
from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalMetric
from .eval_metrics import RubricScore
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .evaluator import PerInvocationResult
from .llm_as_judge_utils import get_eval_status
class AutoRaterScore(EvalBaseModel):
score: Optional[float] = None
rubric_scores: Optional[list[RubricScore]] = None
class LlmAsJudge(Evaluator):
"""Evaluator based on a LLM.
@@ -82,7 +89,7 @@ class LlmAsJudge(Evaluator):
@abstractmethod
def convert_auto_rater_response_to_score(
self, auto_rater_response: LlmResponse
) -> Optional[float]:
) -> AutoRaterScore:
"""Parses auto_rater_response and returns the corresponding score, or None if the score cannot be determined."""
@abstractmethod
@@ -126,15 +133,18 @@ class LlmAsJudge(Evaluator):
) as agen:
async for llm_response in agen:
# Non-streaming call, so there is only one response content.
score = self.convert_auto_rater_response_to_score(llm_response)
auto_rater_score = self.convert_auto_rater_response_to_score(
llm_response
)
invocation_result_samples.append(
PerInvocationResult(
actual_invocation=actual,
expected_invocation=expected,
score=score,
score=auto_rater_score.score,
eval_status=get_eval_status(
score, self._criterion.threshold
auto_rater_score.score, self._eval_metric.threshold
),
rubric_scores=auto_rater_score.rubric_scores,
)
)
if not invocation_result_samples:
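A minimal sketch of the updated subclass contract (a hypothetical evaluator; the parsing is illustrative): convert_auto_rater_response_to_score now returns an AutoRaterScore instead of Optional[float], so rubric-level results can travel alongside the scalar score:

from google.adk.evaluation.llm_as_judge import AutoRaterScore
from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
from google.adk.models.llm_response import LlmResponse

def convert_auto_rater_response_to_score(llm_response: LlmResponse) -> AutoRaterScore:
    response_text = get_text_from_content(llm_response.content)
    if response_text is None:
        # Leaving score unset leads to EvalStatus.NOT_EVALUATED downstream.
        return AutoRaterScore()
    # Illustrative parsing only; real evaluators parse a structured critique.
    return AutoRaterScore(score=1.0 if response_text.strip().lower() == 'valid' else 0.0)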
@@ -15,10 +15,17 @@
from __future__ import annotations
import enum
import statistics
from typing import Optional
from typing import Union
from google.genai import types as genai_types
from .app_details import AppDetails
from .common import EvalBaseModel
from .eval_case import get_all_tool_calls_with_responses
from .eval_case import IntermediateDataType
from .eval_metrics import RubricScore
from .evaluator import EvalStatus
@@ -46,3 +53,97 @@ def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus:
if score is None:
return EvalStatus.NOT_EVALUATED
return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
def get_average_rubric_score(
rubric_scores: list[RubricScore],
) -> Optional[float]:
"""Returns a single score value from the given list of rubric scores.
It is possible that none of the rubric scores actually contains a score value;
if that happens, None is returned.
If one or more score values are present, their mean is returned as the
aggregated value.
"""
rubric_scores = [
rubric_score.score
for rubric_score in rubric_scores
if rubric_score.score is not None
]
return statistics.mean(rubric_scores) if rubric_scores else None
class _ToolDeclarations(EvalBaseModel):
"""Internal data model used for serializing Tool declarations."""
tool_declarations: dict[str, genai_types.ToolListUnion]
def get_tool_declarations_as_json_str(
app_details: AppDetails,
) -> str:
"""Returns a JSON string representation of Tool declarations.
The output of this method is usually intended to be sent to the LLM.
"""
tool_declarations = _ToolDeclarations(
tool_declarations=app_details.get_tools_by_agent_name()
)
return tool_declarations.model_dump_json(
indent=2,
exclude_unset=True,
exclude_defaults=True,
exclude_none=True,
)
class _ToolCallAndResponse(EvalBaseModel):
"""Internal data model to capture one single tool call and response."""
step: int
tool_call: genai_types.FunctionCall
tool_response: Union[genai_types.FunctionResponse, str]
class _ToolCallsAndResponses(EvalBaseModel):
"""Internal data model used for serializing Tool call and responses."""
tool_calls_and_response: list[_ToolCallAndResponse]
def get_tool_calls_and_responses_as_json_str(
intermediate_data: Optional[IntermediateDataType],
) -> str:
"""Returns a JSON string representation of tool calls and corresponding responses.
The output of this method is usually intended to be sent to the LLM.
"""
raw_tool_calls_and_response = get_all_tool_calls_with_responses(
intermediate_data
)
if not raw_tool_calls_and_response:
return "No intermediate steps were taken."
tool_calls_and_responses = []
for idx, (tool_call, tool_response) in enumerate(raw_tool_calls_and_response):
tool_calls_and_responses.append(
_ToolCallAndResponse(
step=idx,
tool_call=tool_call,
tool_response=tool_response if tool_response else "None",
)
)
internal_tool_calls_and_responses = _ToolCallsAndResponses(
tool_calls_and_response=tool_calls_and_responses
)
return internal_tool_calls_and_responses.model_dump_json(
indent=2,
exclude_unset=True,
exclude_defaults=True,
exclude_none=True,
)
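A short sketch of how the two serialization helpers can be used to build auto-rater context, mirroring the added tests (the agent and tool names are illustrative):

from google.adk.evaluation.app_details import AgentDetails
from google.adk.evaluation.app_details import AppDetails
from google.adk.evaluation.eval_case import IntermediateData
from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str
from google.genai import types as genai_types

app_details = AppDetails(
    agent_details={
        "agent1": AgentDetails(
            name="agent1",
            tool_declarations=[
                genai_types.Tool(
                    function_declarations=[
                        genai_types.FunctionDeclaration(name="search")
                    ]
                )
            ],
        )
    }
)
# JSON string keyed by agent name, e.g. {"tool_declarations": {"agent1": [...]}}
get_tool_declarations_as_json_str(app_details)

# With no tool activity, a fixed sentinel string is returned instead of JSON.
get_tool_calls_and_responses_as_json_str(
    IntermediateData(tool_uses=[], tool_responses=[])
)  # -> 'No intermediate steps were taken.'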
@@ -40,6 +40,7 @@ from .base_eval_service import InferenceStatus
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import EvalMetricResult
from .eval_metrics import EvalMetricResultDetails
from .eval_metrics import EvalMetricResultPerInvocation
from .eval_result import EvalCaseResult
from .eval_set import EvalCase
@@ -239,12 +240,15 @@ class LocalEvalService(BaseEvalService):
)
# Track overall score across all invocations.
eval_metric_result_details = EvalMetricResultDetails(
rubric_scores=evaluation_result.overall_rubric_scores
)
overall_eval_metric_results.append(
EvalMetricResult(
metric_name=eval_metric.metric_name,
threshold=eval_metric.threshold,
score=evaluation_result.overall_score,
eval_status=evaluation_result.overall_eval_status,
details=eval_metric_result_details,
**eval_metric.model_dump(),
)
)
@@ -262,12 +266,15 @@ class LocalEvalService(BaseEvalService):
evaluation_result.per_invocation_results,
eval_metric_result_per_invocation,
):
eval_metric_result_details = EvalMetricResultDetails(
rubric_scores=invocation_result.rubric_scores
)
invocation.eval_metric_results.append(
EvalMetricResult(
metric_name=eval_metric.metric_name,
threshold=eval_metric.threshold,
score=invocation_result.score,
eval_status=invocation_result.eval_status,
details=eval_metric_result_details,
**eval_metric.model_dump(),
)
)
@@ -24,6 +24,7 @@ from .eval_metrics import PrebuiltMetrics
from .evaluator import Evaluator
from .final_response_match_v2 import FinalResponseMatchV2Evaluator
from .response_evaluator import ResponseEvaluator
from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
from .safety_evaluator import SafetyEvaluatorV1
from .trajectory_evaluator import TrajectoryEvaluator
@@ -111,6 +112,10 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
metric_info=FinalResponseMatchV2Evaluator.get_metric_info(),
evaluator=FinalResponseMatchV2Evaluator,
)
metric_evaluator_registry.register_evaluator(
metric_info=RubricBasedFinalResponseQualityV1Evaluator.get_metric_info(),
evaluator=RubricBasedFinalResponseQualityV1Evaluator,
)
return metric_evaluator_registry
File diff suppressed because it is too large.
@@ -0,0 +1,73 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from google.adk.evaluation.app_details import AgentDetails
from google.adk.evaluation.app_details import AppDetails
from google.genai import types as genai_types
from pytest import raises
def test_get_developer_instructions_existing_agent():
agent_details = {
'agent1': AgentDetails(
name='agent1', instructions='instruction for agent1'
),
'agent2': AgentDetails(
name='agent2', instructions='instruction for agent2'
),
}
app_details = AppDetails(
agent_details=agent_details,
)
# Test for existing agent
instructions = app_details.get_developer_instructions('agent1')
assert instructions == 'instruction for agent1'
def test_get_developer_instructions_non_existing_agent():
agent_details = {
'agent1': AgentDetails(
name='agent1', instructions='instruction for agent1'
),
'agent2': AgentDetails(
name='agent2', instructions='instruction for agent2'
),
}
app_details = AppDetails(
agent_details=agent_details,
)
# Test for a non-existing agent
with raises(ValueError, match='`agent3` not found in the agentic system.'):
app_details.get_developer_instructions('agent3')
def test_get_tools_by_agent_name():
tool1 = genai_types.Tool(
function_declarations=[genai_types.FunctionDeclaration(name='tool1_func')]
)
agent_details = {
'agent1': AgentDetails(name='agent1', tool_declarations=[tool1]),
'agent2': AgentDetails(name='agent2', tool_declarations=[]),
}
app_details = AppDetails(
agent_details=agent_details,
)
tools = app_details.get_tools_by_agent_name()
expected_tools = {'agent1': [tool1], 'agent2': []}
assert tools == expected_tools
@@ -15,6 +15,8 @@
from __future__ import annotations
from google.adk.evaluation.eval_case import get_all_tool_calls
from google.adk.evaluation.eval_case import get_all_tool_calls_with_responses
from google.adk.evaluation.eval_case import get_all_tool_responses
from google.adk.evaluation.eval_case import IntermediateData
from google.adk.evaluation.eval_case import InvocationEvent
from google.adk.evaluation.eval_case import InvocationEvents
@@ -97,3 +99,150 @@ def test_get_all_tool_calls_with_unsupported_type():
ValueError, match='Unsupported type for intermediate_data'
):
get_all_tool_calls('this is not a valid type')
def test_get_all_tool_responses_with_none_input():
"""Tests that an empty list is returned when intermediate_data is None."""
assert get_all_tool_responses(None) == []
def test_get_all_tool_responses_with_empty_invocation_events():
"""Tests InvocationEvents with an empty list of events."""
intermediate_data = InvocationEvents(invocation_events=[])
assert get_all_tool_responses(intermediate_data) == []
def test_get_all_tool_responses_with_invocation_events_no_tools():
"""Tests InvocationEvents containing events without any tool responses."""
invocation_event = InvocationEvent(
author='agent',
content=genai_types.Content(
parts=[genai_types.Part(text='Thinking...')], role='model'
),
)
intermediate_data = InvocationEvents(invocation_events=[invocation_event])
assert get_all_tool_responses(intermediate_data) == []
def test_get_all_tool_responses_with_invocation_events():
"""Tests that tool responses are correctly extracted from a InvocationEvents object."""
tool_response1 = genai_types.FunctionResponse(
name='search', response={'result': 'weather is good'}
)
tool_response2 = genai_types.FunctionResponse(
name='lookup', response={'id': '123'}
)
invocation_event1 = InvocationEvent(
author='agent1',
content=genai_types.Content(
parts=[genai_types.Part(function_response=tool_response1)],
role='model',
),
)
invocation_event2 = InvocationEvent(
author='agent2',
content=genai_types.Content(
parts=[
genai_types.Part(text='Found something.'),
genai_types.Part(function_response=tool_response2),
],
role='model',
),
)
intermediate_data = InvocationEvents(
invocation_events=[invocation_event1, invocation_event2]
)
assert get_all_tool_responses(intermediate_data) == [
tool_response1,
tool_response2,
]
def test_get_all_tool_responses_with_unsupported_type():
"""Tests that a ValueError is raised for unsupported intermediate_data types."""
with pytest.raises(
ValueError, match='Unsupported type for intermediate_data'
):
get_all_tool_responses('this is not a valid type')
def test_get_all_tool_calls_with_responses_with_none_input():
"""Tests that an empty list is returned when intermediate_data is None."""
assert get_all_tool_calls_with_responses(None) == []
def test_get_all_tool_calls_with_responses_with_intermediate_data_no_tool_calls():
"""Tests get_all_tool_calls_with_responses with IntermediateData with no tool calls."""
# No tool calls
intermediate_data = IntermediateData(tool_uses=[], tool_responses=[])
assert get_all_tool_calls_with_responses(intermediate_data) == []
def test_get_all_tool_calls_with_responses_with_intermediate_data_with_tool_calls():
"""Tests get_all_tool_calls_with_responses with IntermediateData with tools."""
# With matching and non-matching tool calls
tool_call1 = genai_types.FunctionCall(
name='search', args={'query': 'weather'}, id='call1'
)
tool_response1 = genai_types.FunctionResponse(
name='search', response={'result': 'sunny'}, id='call1'
)
tool_call2 = genai_types.FunctionCall(
name='lookup', args={'id': '123'}, id='call2'
)
intermediate_data = IntermediateData(
tool_uses=[tool_call1, tool_call2], tool_responses=[tool_response1]
)
assert get_all_tool_calls_with_responses(intermediate_data) == [
(tool_call1, tool_response1),
(tool_call2, None),
]
def test_get_all_tool_calls_with_responses_with_steps_no_tool_calls():
"""Tests get_all_tool_calls_with_responses with Steps that don't have tool calls."""
# No tool calls
intermediate_data = InvocationEvents(invocation_events=[])
assert get_all_tool_calls_with_responses(intermediate_data) == []
def test_get_all_tool_calls_with_responses_with_invocation_events():
"""Tests get_all_tool_calls_with_responses with InvocationEvents."""
# No tools
intermediate_data = InvocationEvents(invocation_events=[])
assert get_all_tool_calls_with_responses(intermediate_data) == []
# With matching and non-matching tool calls
tool_call1 = genai_types.FunctionCall(
name='search', args={'query': 'weather'}, id='call1'
)
tool_response1 = genai_types.FunctionResponse(
name='search', response={'result': 'sunny'}, id='call1'
)
tool_call2 = genai_types.FunctionCall(
name='lookup', args={'id': '123'}, id='call2'
)
invocation_event1 = InvocationEvent(
author='agent',
content=genai_types.Content(
parts=[
genai_types.Part(function_call=tool_call1),
genai_types.Part(function_call=tool_call2),
],
role='model',
),
)
invocation_event2 = InvocationEvent(
author='tool',
content=genai_types.Content(
parts=[genai_types.Part(function_response=tool_response1)],
role='tool',
),
)
intermediate_data = InvocationEvents(
invocation_events=[invocation_event1, invocation_event2]
)
assert get_all_tool_calls_with_responses(intermediate_data) == [
(tool_call1, tool_response1),
(tool_call2, None),
]
@@ -17,12 +17,13 @@ from __future__ import annotations
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import BaseCriterion
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import EvalStatus
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.evaluator import PerInvocationResult
from google.adk.evaluation.final_response_match_v2 import _parse_critique
from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
from google.adk.evaluation.llm_as_judge import AutoRaterScore
from google.adk.evaluation.llm_as_judge_utils import Label
from google.adk.models.llm_response import LlmResponse
from google.genai import types as genai_types
@@ -206,8 +207,10 @@ def test_convert_auto_rater_response_to_score_valid():
role="model",
)
)
score = evaluator.convert_auto_rater_response_to_score(llm_response)
assert score == 1.0
auto_rater_score = evaluator.convert_auto_rater_response_to_score(
llm_response
)
assert auto_rater_score == AutoRaterScore(score=1.0)
def test_convert_auto_rater_response_to_score_invalid():
@@ -224,8 +227,10 @@ def test_convert_auto_rater_response_to_score_invalid():
role="model",
)
)
score = evaluator.convert_auto_rater_response_to_score(llm_response)
assert score == 0.0
auto_rater_score = evaluator.convert_auto_rater_response_to_score(
llm_response
)
assert auto_rater_score == AutoRaterScore(score=0.0)
def test_convert_auto_rater_response_to_score_invalid_json():
@@ -236,8 +241,10 @@ def test_convert_auto_rater_response_to_score_invalid_json():
role="model",
)
)
score = evaluator.convert_auto_rater_response_to_score(llm_response)
assert score is None
auto_rater_score = evaluator.convert_auto_rater_response_to_score(
llm_response
)
assert auto_rater_score == AutoRaterScore()
def test_convert_auto_rater_response_to_score_missing_key():
@@ -248,8 +255,10 @@ def test_convert_auto_rater_response_to_score_missing_key():
role="model",
)
)
score = evaluator.convert_auto_rater_response_to_score(llm_response)
assert score is None
auto_rater_score = evaluator.convert_auto_rater_response_to_score(
llm_response
)
assert auto_rater_score == AutoRaterScore()
def test_aggregate_per_invocation_samples_none_evaluated():
@@ -24,6 +24,7 @@ from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import PerInvocationResult
from google.adk.evaluation.llm_as_judge import AutoRaterScore
from google.adk.evaluation.llm_as_judge import LlmAsJudge
from google.adk.evaluation.llm_as_judge_utils import get_eval_status
from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
@@ -41,8 +42,8 @@ class MockLlmAsJudge(LlmAsJudge):
def convert_auto_rater_response_to_score(
self, llm_response: LlmResponse
) -> Optional[float]:
return 1.0
) -> AutoRaterScore:
return AutoRaterScore(score=1.0)
def aggregate_per_invocation_samples(
self,
@@ -0,0 +1,290 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import json
from google.adk.evaluation.app_details import AgentDetails
from google.adk.evaluation.app_details import AppDetails
from google.adk.evaluation.eval_case import IntermediateData
from google.adk.evaluation.eval_case import InvocationEvent
from google.adk.evaluation.eval_case import InvocationEvents
from google.adk.evaluation.eval_rubrics import RubricScore
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score
from google.adk.evaluation.llm_as_judge_utils import get_eval_status
from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str
from google.genai import types as genai_types
def test_get_text_from_content_with_none():
"""Tests get_text_from_content with None as input."""
assert get_text_from_content(None) is None
def test_get_text_from_content_with_content_and_none_parts():
"""Tests get_text_from_content with Content that has None for parts."""
content = genai_types.Content(parts=None)
assert get_text_from_content(content) is None
def test_get_text_from_content_with_empty_parts():
"""Tests get_text_from_content with an empty parts list."""
content = genai_types.Content(parts=[])
assert get_text_from_content(content) is None
def test_get_text_from_content_with_parts_but_no_text():
"""Tests get_text_from_content with parts that do not contain text."""
content = genai_types.Content(
parts=[
genai_types.Part(
function_call=genai_types.FunctionCall(name="test_func")
)
]
)
assert get_text_from_content(content) == ""
def test_get_text_from_content_with_single_text_part():
"""Tests get_text_from_content with a single text part."""
content = genai_types.Content(parts=[genai_types.Part(text="Hello")])
assert get_text_from_content(content) == "Hello"
def test_get_text_from_content_with_multiple_text_parts():
"""Tests get_text_from_content with multiple text parts."""
content = genai_types.Content(
parts=[genai_types.Part(text="Hello"), genai_types.Part(text="World")]
)
assert get_text_from_content(content) == "Hello\nWorld"
def test_get_text_from_content_with_mixed_parts():
"""Tests get_text_from_content with a mix of text and non-text parts."""
content = genai_types.Content(
parts=[
genai_types.Part(text="Hello"),
genai_types.Part(
function_call=genai_types.FunctionCall(name="test_func")
),
genai_types.Part(text="World"),
]
)
assert get_text_from_content(content) == "Hello\nWorld"
def test_get_eval_status_with_none_score():
"""Tests get_eval_status returns NOT_EVALUATED for a None score."""
assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED
def test_get_eval_status_when_score_is_greater_than_threshold():
"""Tests get_eval_status returns PASSED when score > threshold."""
assert get_eval_status(score=0.8, threshold=0.5) == EvalStatus.PASSED
def test_get_eval_status_when_score_is_equal_to_threshold():
"""Tests get_eval_status returns PASSED when score == threshold."""
assert get_eval_status(score=0.5, threshold=0.5) == EvalStatus.PASSED
def test_get_eval_status_when_score_is_less_than_threshold():
"""Tests get_eval_status returns FAILED when score < threshold."""
assert get_eval_status(score=0.4, threshold=0.5) == EvalStatus.FAILED
def test_get_average_rubric_score_with_empty_list():
"""Tests get_average_rubric_score returns None for an empty list."""
assert get_average_rubric_score([]) is None
def test_get_average_rubric_score_with_all_none_scores():
"""Tests get_average_rubric_score returns None when all scores are None."""
rubric_scores = [
RubricScore(rubric_id="1", score=None),
RubricScore(rubric_id="2", score=None),
]
assert get_average_rubric_score(rubric_scores) is None
def test_get_average_rubric_score_with_single_score():
"""Tests get_average_rubric_score with a single valid score."""
rubric_scores = [RubricScore(rubric_id="1", score=0.8)]
assert get_average_rubric_score(rubric_scores) == 0.8
def test_get_average_rubric_score_with_multiple_scores():
"""Tests get_average_rubric_score with multiple valid scores."""
rubric_scores = [
RubricScore(rubric_id="1", score=0.8),
RubricScore(rubric_id="2", score=0.6),
]
assert get_average_rubric_score(rubric_scores) == 0.7
def test_get_average_rubric_score_with_mixed_scores():
"""Tests get_average_rubric_score with a mix of valid and None scores."""
rubric_scores = [
RubricScore(rubric_id="1", score=0.8),
RubricScore(rubric_id="2", score=None),
RubricScore(rubric_id="3", score=0.6),
]
assert get_average_rubric_score(rubric_scores) == 0.7
def test_get_tool_declarations_as_json_str_with_no_agents():
"""Tests get_tool_declarations_as_json_str with no agents."""
app_details = AppDetails(agent_details={})
expected_json = {"tool_declarations": {}}
actual_json_str = get_tool_declarations_as_json_str(app_details)
assert json.loads(actual_json_str) == expected_json
def test_get_tool_declarations_as_json_str_with_agent_no_tools():
"""Tests get_tool_declarations_as_json_str with an agent that has no tools."""
agent_details = {"agent1": AgentDetails(name="agent1", tool_declarations=[])}
app_details = AppDetails(agent_details=agent_details)
expected_json = {"tool_declarations": {"agent1": []}}
actual_json_str = get_tool_declarations_as_json_str(app_details)
assert json.loads(actual_json_str) == expected_json
def test_get_tool_declarations_as_json_str_with_agent_with_tools():
"""Tests get_tool_declarations_as_json_str with an agent that has tools."""
tool1 = genai_types.Tool(
function_declarations=[
genai_types.FunctionDeclaration(
name="test_func", description="A test function."
)
]
)
agent_details = {
"agent1": AgentDetails(name="agent1", tool_declarations=[tool1])
}
app_details = AppDetails(agent_details=agent_details)
expected_json = {
"tool_declarations": {
"agent1": [{
"function_declarations": [{
"name": "test_func",
"description": "A test function.",
}]
}]
}
}
actual_json_str = get_tool_declarations_as_json_str(app_details)
assert json.loads(actual_json_str) == expected_json
def test_get_tool_declarations_as_json_str_with_multiple_agents():
"""Tests get_tool_declarations_as_json_str with multiple agents."""
tool1 = genai_types.Tool(
function_declarations=[
genai_types.FunctionDeclaration(
name="test_func1", description="A test function 1."
)
]
)
agent_details = {
"agent1": AgentDetails(name="agent1", tool_declarations=[tool1]),
"agent2": AgentDetails(name="agent2", tool_declarations=[]),
}
app_details = AppDetails(agent_details=agent_details)
expected_json = {
"tool_declarations": {
"agent1": [{
"function_declarations": [{
"name": "test_func1",
"description": "A test function 1.",
}]
}],
"agent2": [],
}
}
actual_json_str = get_tool_declarations_as_json_str(app_details)
assert json.loads(actual_json_str) == expected_json
def test_get_tool_calls_and_responses_as_json_str_with_none():
"""Tests get_tool_calls_and_responses_as_json_str with None."""
assert (
get_tool_calls_and_responses_as_json_str(None)
== "No intermediate steps were taken."
)
def test_get_tool_calls_and_responses_as_json_str_with_intermediate_data_no_tools():
"""Tests get_tool_calls_and_responses_as_json_str with IntermediateData and no tools."""
intermediate_data = IntermediateData(tool_uses=[], tool_responses=[])
assert (
get_tool_calls_and_responses_as_json_str(intermediate_data)
== "No intermediate steps were taken."
)
intermediate_data = InvocationEvents(invocation_events=[])
assert (
get_tool_calls_and_responses_as_json_str(intermediate_data)
== "No intermediate steps were taken."
)
def test_get_tool_calls_and_responses_as_json_str_with_invocation_events_multiple_calls():
"""Tests get_tool_calls_and_responses_as_json_str with multiple calls in InvocationEvents."""
tool_call1 = genai_types.FunctionCall(name="func1", args={}, id="call1")
tool_call2 = genai_types.FunctionCall(name="func2", args={}, id="call2")
tool_response1 = genai_types.FunctionResponse(
name="func1", response={"status": "ok"}, id="call1"
)
invocation_event1 = InvocationEvent(
author="agent",
content=genai_types.Content(
parts=[
genai_types.Part(function_call=tool_call1),
genai_types.Part(function_call=tool_call2),
]
),
)
invocation_event2 = InvocationEvent(
author="tool",
content=genai_types.Content(
parts=[genai_types.Part(function_response=tool_response1)]
),
)
intermediate_data = InvocationEvents(
invocation_events=[invocation_event1, invocation_event2]
)
json_str = get_tool_calls_and_responses_as_json_str(intermediate_data)
expected_json = {
"tool_calls_and_response": [
{
"step": 0,
"tool_call": {"name": "func1", "args": {}, "id": "call1"},
"tool_response": {
"name": "func1",
"response": {"status": "ok"},
"id": "call1",
},
},
{
"step": 1,
"tool_call": {"name": "func2", "args": {}, "id": "call2"},
"tool_response": "None",
},
]
}
assert json.loads(json_str) == expected_json
File diff suppressed because it is too large.