From 5a485b01cd64cb49735e13ebd5e7fa3da02cd85f Mon Sep 17 00:00:00 2001 From: Ankur Sharma Date: Wed, 24 Sep 2025 20:30:09 -0700 Subject: [PATCH] feat: Adds Rubric based final response evaluator The evaluator uses a set of rubrics to assess the quality of the agent's final response. PiperOrigin-RevId: 811154498 --- src/google/adk/evaluation/app_details.py | 14 + src/google/adk/evaluation/eval_case.py | 51 ++ src/google/adk/evaluation/eval_metrics.py | 4 + src/google/adk/evaluation/evaluator.py | 6 + .../adk/evaluation/final_response_match_v2.py | 11 +- src/google/adk/evaluation/llm_as_judge.py | 18 +- .../adk/evaluation/llm_as_judge_utils.py | 101 +++ .../adk/evaluation/local_eval_service.py | 15 +- .../evaluation/metric_evaluator_registry.py | 5 + .../rubric_based_final_response_quality_v1.py | 574 ++++++++++++++++ .../unittests/evaluation/test_app_details.py | 73 ++ tests/unittests/evaluation/test_eval_case.py | 149 ++++ .../test_final_response_match_v2.py | 27 +- .../unittests/evaluation/test_llm_as_judge.py | 5 +- .../evaluation/test_llm_as_judge_utils.py | 290 ++++++++ ..._rubric_based_final_response_quality_v1.py | 650 ++++++++++++++++++ 16 files changed, 1969 insertions(+), 24 deletions(-) create mode 100644 src/google/adk/evaluation/rubric_based_final_response_quality_v1.py create mode 100644 tests/unittests/evaluation/test_app_details.py create mode 100644 tests/unittests/evaluation/test_llm_as_judge_utils.py create mode 100644 tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py diff --git a/src/google/adk/evaluation/app_details.py b/src/google/adk/evaluation/app_details.py index 534dbf94..d7b6c949 100644 --- a/src/google/adk/evaluation/app_details.py +++ b/src/google/adk/evaluation/app_details.py @@ -47,3 +47,17 @@ class AppDetails(EvalBaseModel): default_factory=dict, ) """A mapping from the agent name to the details of that agent.""" + + def get_developer_instructions(self, agent_name: str) -> str: + """Returns a string containing the developer instructions.""" + if agent_name not in self.agent_details: + raise ValueError(f"`{agent_name}` not found in the agentic system.") + + return self.agent_details[agent_name].instructions + + def get_tools_by_agent_name(self) -> dict[str, genai_types.ToolListUnion]: + """Returns a dictionary of tools available to an agent in the App, keyed to the name of the Agent.""" + return { + name: details.tool_declarations + for name, details in self.agent_details.items() + } diff --git a/src/google/adk/evaluation/eval_case.py b/src/google/adk/evaluation/eval_case.py index df2c478e..1f736a7f 100644 --- a/src/google/adk/evaluation/eval_case.py +++ b/src/google/adk/evaluation/eval_case.py @@ -168,3 +168,54 @@ def get_all_tool_calls( ) return tool_calls + + +def get_all_tool_responses( + intermediate_data: Optional[IntermediateDataType], +) -> list[genai_types.FunctionResponse]: + """A utility method to retrieve tools responses from intermediate data.""" + if not intermediate_data: + return [] + + tool_responses = [] + if isinstance(intermediate_data, IntermediateData): + tool_responses = intermediate_data.tool_responses + elif isinstance(intermediate_data, InvocationEvents): + # Go over each event in the list of events + for invocation_event in intermediate_data.invocation_events: + # Check if the event has content and some parts. + if invocation_event.content and invocation_event.content.parts: + for p in invocation_event.content.parts: + # For each part, we check if any of those part is a function response. 
+ if p.function_response: + tool_responses.append(p.function_response) + else: + raise ValueError( + f"Unsupported type for intermediate_data `{intermediate_data}`" + ) + + return tool_responses + + +ToolCallAndResponse: TypeAlias = tuple[ + genai_types.FunctionCall, Optional[genai_types.FunctionResponse] +] +"""A Tuple representing a Function call and corresponding optional function response.""" + + +def get_all_tool_calls_with_responses( + intermediate_data: Optional[IntermediateDataType], +) -> list[ToolCallAndResponse]: + """Returns tool calls with the corresponding responses, if available.""" + tool_responses_by_call_id: dict[str, genai_types.FunctionResponse] = { + tool_response.id: tool_response + for tool_response in get_all_tool_responses(intermediate_data) + } + + tool_call_and_responses: list[ToolCallAndResponse] = [] + + for tool_call in get_all_tool_calls(intermediate_data): + response = tool_responses_by_call_id.get(tool_call.id, None) + tool_call_and_responses.append((tool_call, response)) + + return tool_call_and_responses diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py index 66f7299f..6dd62992 100644 --- a/src/google/adk/evaluation/eval_metrics.py +++ b/src/google/adk/evaluation/eval_metrics.py @@ -48,6 +48,10 @@ class PrebuiltMetrics(Enum): FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2" + RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1 = ( + "rubric_based_final_response_quality_v1" + ) + MetricName: TypeAlias = Union[str, PrebuiltMetrics] Threshold: TypeAlias = float diff --git a/src/google/adk/evaluation/evaluator.py b/src/google/adk/evaluation/evaluator.py index 07ee9584..3cda983f 100644 --- a/src/google/adk/evaluation/evaluator.py +++ b/src/google/adk/evaluation/evaluator.py @@ -23,6 +23,7 @@ from typing_extensions import TypeAlias from .eval_case import Invocation from .eval_metrics import BaseCriterion from .eval_metrics import EvalStatus +from .eval_rubrics import RubricScore # Redefining the type here for backward compatibility. 
EvalStatus: TypeAlias = EvalStatus @@ -35,6 +36,7 @@ class PerInvocationResult(BaseModel): expected_invocation: Invocation score: Optional[float] = None eval_status: EvalStatus = EvalStatus.NOT_EVALUATED + rubric_scores: Optional[list[RubricScore]] = None class EvaluationResult(BaseModel): @@ -45,6 +47,10 @@ class EvaluationResult(BaseModel): """Overall status, based on each invocation.""" per_invocation_results: list[PerInvocationResult] = [] + """Detailed results per invocation.""" + + overall_rubric_scores: Optional[list[RubricScore]] = None + """Overall rubric, based on each invocation.""" class Evaluator(ABC): diff --git a/src/google/adk/evaluation/final_response_match_v2.py b/src/google/adk/evaluation/final_response_match_v2.py index 827f397b..0e8e666c 100644 --- a/src/google/adk/evaluation/final_response_match_v2.py +++ b/src/google/adk/evaluation/final_response_match_v2.py @@ -33,6 +33,7 @@ from .eval_metrics import MetricValueInfo from .eval_metrics import PrebuiltMetrics from .evaluator import EvaluationResult from .evaluator import PerInvocationResult +from .llm_as_judge import AutoRaterScore from .llm_as_judge import LlmAsJudge from .llm_as_judge_utils import get_eval_status from .llm_as_judge_utils import get_text_from_content @@ -179,17 +180,17 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge): @override def convert_auto_rater_response_to_score( self, llm_response: LlmResponse - ) -> Optional[float]: + ) -> AutoRaterScore: response_text = get_text_from_content(llm_response.content) if response_text is None: - return None + return AutoRaterScore() label = _parse_critique(response_text) if label == Label.VALID: - return 1.0 + return AutoRaterScore(score=1.0) elif label == Label.INVALID: - return 0.0 + return AutoRaterScore(score=0.0) else: - return None + return AutoRaterScore() @override def aggregate_per_invocation_samples( diff --git a/src/google/adk/evaluation/llm_as_judge.py b/src/google/adk/evaluation/llm_as_judge.py index cf86ffbb..4287a3e0 100644 --- a/src/google/adk/evaluation/llm_as_judge.py +++ b/src/google/adk/evaluation/llm_as_judge.py @@ -26,15 +26,22 @@ from ..models.llm_request import LlmRequest from ..models.llm_response import LlmResponse from ..models.registry import LLMRegistry from ..utils.context_utils import Aclosing +from .common import EvalBaseModel from .eval_case import Invocation from .eval_metrics import BaseCriterion from .eval_metrics import EvalMetric +from .eval_metrics import RubricScore from .evaluator import EvaluationResult from .evaluator import Evaluator from .evaluator import PerInvocationResult from .llm_as_judge_utils import get_eval_status +class AutoRaterScore(EvalBaseModel): + score: Optional[float] = None + rubric_scores: Optional[list[RubricScore]] = None + + class LlmAsJudge(Evaluator): """Evaluator based on a LLM. @@ -82,7 +89,7 @@ class LlmAsJudge(Evaluator): @abstractmethod def convert_auto_rater_response_to_score( self, auto_rater_response: LlmResponse - ) -> Optional[float]: + ) -> AutoRaterScore: """Parses auto_rater_response and returns the corresponding score, or None if the score cannot be determined.""" @abstractmethod @@ -126,15 +133,18 @@ class LlmAsJudge(Evaluator): ) as agen: async for llm_response in agen: # Non-streaming call, so there is only one response content. 
- score = self.convert_auto_rater_response_to_score(llm_response) + auto_rater_score = self.convert_auto_rater_response_to_score( + llm_response + ) invocation_result_samples.append( PerInvocationResult( actual_invocation=actual, expected_invocation=expected, - score=score, + score=auto_rater_score.score, eval_status=get_eval_status( - score, self._criterion.threshold + auto_rater_score.score, self._eval_metric.threshold ), + rubric_scores=auto_rater_score.rubric_scores, ) ) if not invocation_result_samples: diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py index c5b780fc..5d17b0c4 100644 --- a/src/google/adk/evaluation/llm_as_judge_utils.py +++ b/src/google/adk/evaluation/llm_as_judge_utils.py @@ -15,10 +15,17 @@ from __future__ import annotations import enum +import statistics from typing import Optional +from typing import Union from google.genai import types as genai_types +from .app_details import AppDetails +from .common import EvalBaseModel +from .eval_case import get_all_tool_calls_with_responses +from .eval_case import IntermediateDataType +from .eval_metrics import RubricScore from .evaluator import EvalStatus @@ -46,3 +53,97 @@ def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus: if score is None: return EvalStatus.NOT_EVALUATED return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED + + +def get_average_rubric_score( + rubric_scores: list[RubricScore], +) -> Optional[float]: + """Returns a single score value from the given list of rubric scores. + + It is possible that none of the rubric score actually contain a score value, + if that happens then None is returned. + + If non-zero score values are present, then a mean value is returned as the + aggregated value. + """ + rubric_scores = [ + rubric_score.score + for rubric_score in rubric_scores + if rubric_score.score is not None + ] + + return statistics.mean(rubric_scores) if rubric_scores else None + + +class _ToolDeclarations(EvalBaseModel): + """Internal data model used for serializing Tool declarations.""" + + tool_declarations: dict[str, genai_types.ToolListUnion] + + +def get_tool_declarations_as_json_str( + app_details: AppDetails, +) -> str: + """Returns a JSON string representation of Tool declarations. + + The output of this method is usually intended to be sent to the LLM. + """ + tool_declarations = _ToolDeclarations( + tool_declarations=app_details.get_tools_by_agent_name() + ) + return tool_declarations.model_dump_json( + indent=2, + exclude_unset=True, + exclude_defaults=True, + exclude_none=True, + ) + + +class _ToolCallAndResponse(EvalBaseModel): + """Internal data model to capture one single tool call and response.""" + + step: int + tool_call: genai_types.FunctionCall + tool_response: Union[genai_types.FunctionResponse, str] + + +class _ToolCallsAndResponses(EvalBaseModel): + """Internal data model used for serializing Tool call and responses.""" + + tool_calls_and_response: list[_ToolCallAndResponse] + + +def get_tool_calls_and_responses_as_json_str( + intermediate_data: Optional[IntermediateDataType], +) -> str: + """Returns a JSON string representation of tool calls and corresponding responses. + + The output of this method is usually intended to be sent to the LLM. + """ + raw_tool_calls_and_response = get_all_tool_calls_with_responses( + intermediate_data + ) + + if not raw_tool_calls_and_response: + return "No intermediate steps were taken." 
+ + tool_calls_and_responses = [] + for idx, (tool_call, tool_response) in enumerate(raw_tool_calls_and_response): + tool_calls_and_responses.append( + _ToolCallAndResponse( + step=idx, + tool_call=tool_call, + tool_response=tool_response if tool_response else "None", + ) + ) + + internal_tool_calls_and_responses = _ToolCallsAndResponses( + tool_calls_and_response=tool_calls_and_responses + ) + + return internal_tool_calls_and_responses.model_dump_json( + indent=2, + exclude_unset=True, + exclude_defaults=True, + exclude_none=True, + ) diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index fa50f70d..84e26cb1 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -40,6 +40,7 @@ from .base_eval_service import InferenceStatus from .eval_case import Invocation from .eval_metrics import EvalMetric from .eval_metrics import EvalMetricResult +from .eval_metrics import EvalMetricResultDetails from .eval_metrics import EvalMetricResultPerInvocation from .eval_result import EvalCaseResult from .eval_set import EvalCase @@ -239,12 +240,15 @@ class LocalEvalService(BaseEvalService): ) # Track overall scrore across all invocations. + eval_metric_result_details = EvalMetricResultDetails( + rubric_scores=evaluation_result.overall_rubric_scores + ) overall_eval_metric_results.append( EvalMetricResult( - metric_name=eval_metric.metric_name, - threshold=eval_metric.threshold, score=evaluation_result.overall_score, eval_status=evaluation_result.overall_eval_status, + details=eval_metric_result_details, + **eval_metric.model_dump(), ) ) @@ -262,12 +266,15 @@ class LocalEvalService(BaseEvalService): evaluation_result.per_invocation_results, eval_metric_result_per_invocation, ): + eval_metric_result_details = EvalMetricResultDetails( + rubric_scores=invocation_result.rubric_scores + ) invocation.eval_metric_results.append( EvalMetricResult( - metric_name=eval_metric.metric_name, - threshold=eval_metric.threshold, score=invocation_result.score, eval_status=invocation_result.eval_status, + details=eval_metric_result_details, + **eval_metric.model_dump(), ) ) diff --git a/src/google/adk/evaluation/metric_evaluator_registry.py b/src/google/adk/evaluation/metric_evaluator_registry.py index e2bcd5f8..866d2151 100644 --- a/src/google/adk/evaluation/metric_evaluator_registry.py +++ b/src/google/adk/evaluation/metric_evaluator_registry.py @@ -24,6 +24,7 @@ from .eval_metrics import PrebuiltMetrics from .evaluator import Evaluator from .final_response_match_v2 import FinalResponseMatchV2Evaluator from .response_evaluator import ResponseEvaluator +from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator from .safety_evaluator import SafetyEvaluatorV1 from .trajectory_evaluator import TrajectoryEvaluator @@ -111,6 +112,10 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry: metric_info=FinalResponseMatchV2Evaluator.get_metric_info(), evaluator=FinalResponseMatchV2Evaluator, ) + metric_evaluator_registry.register_evaluator( + metric_info=RubricBasedFinalResponseQualityV1Evaluator.get_metric_info(), + evaluator=RubricBasedFinalResponseQualityV1Evaluator, + ) return metric_evaluator_registry diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py new file mode 100644 index 00000000..e9461f9c --- /dev/null +++ 
b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py @@ -0,0 +1,574 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import logging +import re +from typing import ClassVar +from typing import Optional + +from typing_extensions import override + +from ..models.llm_response import LlmResponse +from ..utils.feature_decorator import experimental +from .common import EvalBaseModel +from .eval_case import Invocation +from .eval_case import InvocationEvents +from .eval_metrics import EvalMetric +from .eval_metrics import Interval +from .eval_metrics import MetricInfo +from .eval_metrics import MetricValueInfo +from .eval_metrics import PrebuiltMetrics +from .eval_metrics import RubricsBasedCriterion +from .eval_rubrics import Rubric +from .eval_rubrics import RubricScore +from .evaluator import EvaluationResult +from .evaluator import PerInvocationResult +from .llm_as_judge import AutoRaterScore +from .llm_as_judge import LlmAsJudge +from .llm_as_judge_utils import get_average_rubric_score +from .llm_as_judge_utils import get_eval_status +from .llm_as_judge_utils import get_text_from_content +from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str +from .llm_as_judge_utils import get_tool_declarations_as_json_str + +logger = logging.getLogger("google_adk." + __name__) + +_RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1_PROMPT = """ +SPECIAL INSTRUCTION: think silently. Silent thinking token budget: 10240 tokens. + +# Mission +Your mission is to evaluate the final answer quality of responses generated by an AI agent. You will be presented with a user prompt (), the agent's response () to that user prompt, and a set of properties () that you must use to objectively assess the validity of the agent's response. +Only respond to the properties provided. Do not make up new properties. + +# Rubric +"yes": The model's response fulfilled the property, OR the property's condition was not applicable to the response. +"no": The model's response met the conditions for the property to be applicable, but failed to fulfill it, or the property applies to a claim in the model's response that cannot be unambiguously verified using trusted evidence. + +# Key Evaluation Principles +Your evaluation must follow a two-part process: first, collect trusted evidence from the agent's work, and second, judge the final answer against it. +1. **Establish Trusted Evidence from Tool Calls**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt. + * Your ONLY sources of truth are the and the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the . Examples of procedural flaws include: + * The agent failed to call a tool that will enable it to answer the user's prompt despite having all the necessary parameters to do so. 
+ * The agent called the tool with incorrect or missing parameters. + * The agent called a tool that does not exist, or called a tool with a parameter that does not exist. + * The agent's sequence of tool calls contains a logical error. + * The following kinds of information ABSOLUTELY CANNOT BE USED to derive trusted evidence: + * The agent's final answer. + * The agent's reasoning, summaries, or any interpretations of the tool responses by the agent. + * Any tool call that is flawed (e.g., queries the wrong file, contains incorrect logic). + * You may not have access to the same tools as the agent, so do not attempt to call any tools yourself. +2. **Judge Consistency with the Evidence**: Once you have collected trusted evidence from tool calls, you must determine whether the agent's is consistent with it. A claim in the final answer is only considered correct if it can be unambiguously verified using this evidence. + * If the necessary evidence is missing because the agent failed to make a correct and sound tool call, the final answer must be judged as failing the property. + +While judging the final answer against the evidence, be flexible about how it is conveyed. Accept answers that are semantically equivalent (e.g., different phrasing) as long as they still fulfill the property. For numbers, accept answers that are numerically equivalent, allowing for minor differences in rounding or precision, as long as they do not alter a final conclusion (e.g., the outcome of a statistical test). + +For each property follow these internal steps: +1. Understand the property and the key evaluation principles. +2. Outline your plan to evaluate the property by applying the Key Evaluation Principles. +3. Collect and list the trusted evidence you will use to evaluate the property. Note any procedural flaws in the tool calls. +4. Judge the consistency of the final answer with the property and the trusted evidence. +5. Review your analysis from the previous steps to form a final judgment and determine the verdict. +6. Output the final verdict in the required output format. + +# Output Format (repeat this format for every property, starting with a new line): +Property: [Repeat the property, word for word, without making any changes. Keep everything including punctuation and capitalization as-is.] +Evidence: [List all trusted evidence from tool calls or the user prompt that is relevant to the property (referencing the Step Index). Alternatively, if either no trusted evidence is required, or no trusted evidence exists (e.g., flawed process, missing tool call, tool error), explain why.] +Rationale: [Explain your reasoning, detailing how the evidence (or lack thereof) supports or contradicts the final answer, or why the property is not applicable.] +Verdict: [yes|no] + +REMEMBER: Your answer will help improve the AI agent. It is important to determine the fulfillment of the properties correctly. Even answering "no" will improve the agent! Respond in pure text, not json. + +# Example +## Input + + + You are an AI agent who is an expert in HR data analysis. + If a company has fewer than 100 employees, then the final answer should alert the user that there are fewer than 100 employees. + If you have sufficient information and tools to respond to the user's question, then do not ask for further clarification. + + + {{ + 'name': 'load_hr_data_from_file', + 'description': 'Reads a data file from the company's HR database into a Pandas DataFrame.' 
+ 'parameters': [ + {{ + 'type': 'string', + 'name': 'file_name', + 'description': 'The name of the data file.' + }}, + ], + 'required': ['file_name'] + }}, + {{ + 'name': 'get_manager', + 'description': 'Returns the manager of a given employee.', + 'parameters': [ + {{ + 'type': 'string', + 'name': 'employee_name', + 'description': 'The name of the employee.' + }}, + ], + 'required': ['employee_name'] + }} + + + Using the employees.csv file, determine: + 1. the total number of employees + 2. the name of Alice Smith's manager + 3. the name of the employee with the highest salary, and their gender + 4. the average salary for the "Marketing" department + Please format your final answer as a numbered list. + + + + + [ + {{ + "step_index": 0, + "tool_call": "df = load_hr_data_from_file('employees.csv')\nprint(len(df))", + "tool_response": "110", + }}, + {{ + "step_index": 1, + "tool_call": "print(df[df['Department'] == 'Engineering']['Salary'].mean())", + "tool_response": "155000", + }}, + {{ + "step_index": 2, + "tool_call="print(df.loc[df['Salary'].idxmax(), 'Name'])", + "tool_response": "John Smith", + }}, + ] + + + 1. The total number of employees is 110. + 2. Please provide Alice Smith's employee ID so that I can find her manager. + 3. The employee with the highest salary is John Doe, and this employee's gender is male. + 4. The average salary for the Marketing department is 155000. + + + + +* The final answer correctly identifies the total number of employees. +* The final answer correctly identifies the name of Alice Smith's manager, or correctly states that it cannot be determined and why. +* The final answer correctly states the average salary for the Marketing department. +* The final answer correctly identifies the employee with the highest salary. +* The final answer correctly identifies the gender of the employee with the highest salary, or correctly states that it cannot be determined and why. +* The final answer is formatted as a numbered list. +* If the company has fewer than 100 employees, then the final answer states that it has fewer than 100 employees. + + +## Output +Property: The final answer correctly identifies the total number of employees. +Evidence: The trusted evidence is "110 employees". The tool call in Step 0 is procedurally sound and provides the total number of employees (110) by calling the load_hr_data_from_file tool with the correct file name. +Rationale: The final answer's claim ("110 employees") is fully consistent with the trusted evidence. +Verdict: yes + +Property: The final answer correctly identifies the name of Alice Smith's manager, or correctly states that it cannot be determined and why. +Evidence: No trusted evidence exists. The agent did not perform a tool call to determine the manager of Alice Smith, despite having the necessary information (the employee name) and access to the necessary tools (get_manager) to do so. +Rationale: The agent incorrectly stated that the final answer cannot be determined, despite having the necessary information (the employee name) and tools (get_manager) to determine it. +Verdict: no + +Property: The final answer correctly states the average salary for the Marketing department. +Evidence: No trusted evidence exists for the Marketing department's average salary. The tool call in Step 1 is procedurally flawed; the agent searched for "Engineering" instead of "Marketing". +Rationale: There is no trusted evidence for the Marketing department's average salary. 
+Verdict: no + +Property: The final answer correctly identifies the employee with the highest salary. +Evidence: The trusted evidence is "John Smith". The tool call in Step 2 produces trusted evidence for the employee with the highest salary by calling the load_hr_data_from_file tool with the correct file name and then using the idxmax() method to find the employee with the highest salary. +Rationale: The final answer's claim ("John Doe") is inconsistent with the trusted evidence ("John Smith"). +Verdict: no + +Property: The final answer correctly identifies the gender of the employee with the highest salary, or correctly states that it cannot be determined and why. +Evidence: No trusted evidence exists. The agent did not perform a tool call to determine the gender of the employee with the highest salary. +Rationale: There is no trusted evidence to confirm the gender of the employee with the highest salary that the final answer states (male). Even if the gender is coincidentally actually male, the claim in the final answer cannot be unambiguously verified using the evidence. +Verdict: no + +Property: If the company has fewer than 100 employees, then the final answer should state that it has fewer than 100 employees. +Evidence: The trusted evidence is "110 employees". The tool call in Step 0 correctly counts the total number of employees as 110 by calling the load_hr_data_from_file tool with the correct file name. +Rationale: The total number of employees is 110, so the condition for this property (fewer than 100 employees) was not met. Therefore, the property is not applicable to this response. +Verdict: yes + +Property: The final answer is formatted as a numbered list. +Evidence: N/A. Trusted evidence from tool calls or the user prompt is not required in order to determine the format of the final answer. +Rationale: The final answer is formatted as a numbered list from 1 to 4, e.g. "1. The total number of employees is 110\n2...". 
+Verdict: yes
+
+# Your Turn
+## Input
+
+
+  {developer_instructions}
+
+
+
+  {tool_declarations}
+
+
+
+  {user_input}
+
+
+
+
+
+  {response_steps}
+
+
+  {final_response}
+
+
+
+
+{rubrics}
+
+
+## Output
+"""
+
+
+_PROPERTY_PATTERN = r"(?<=Property: )(.*)"
+_RATIONALE_PATTERN = r"(?<=Rationale: )(.*)"
+_VERDICT_PATTERN = r"(?<=Verdict: )(.*)"
+
+
+class _RubricResponse(EvalBaseModel):
+  """Internal data model to represent a rubric's response from the auto-rater."""
+
+  property_text: Optional[str] = None
+  rationale: Optional[str] = None
+  score: Optional[float] = None
+
+
+def _normalize_text(text: str) -> str:
+  """Returns a normalized version of the passed-in text."""
+  if not isinstance(text, str):
+    return ""
+  return text.lower().strip()
+
+
+def _parse_auto_rater_response(
+    auto_rater_response: str,
+) -> list[_RubricResponse]:
+  """Returns a list of _RubricResponse parsed from the AutoRater's response."""
+  properties = re.findall(_PROPERTY_PATTERN, auto_rater_response)
+  rationales = re.findall(_RATIONALE_PATTERN, auto_rater_response)
+  scores = []
+
+  for verdict in re.findall(_VERDICT_PATTERN, auto_rater_response):
+    if "yes" in verdict.lower():
+      score = 1.0
+    elif "no" in verdict.lower():
+      score = 0.0
+    else:
+      score = None
+
+    scores.append(score)
+
+  rubric_responses = []
+  for p, r, s in zip(properties, rationales, scores):
+    rubric_responses.append(
+        _RubricResponse(property_text=p.strip(), rationale=r.strip(), score=s)
+    )
+
+  return rubric_responses
+
+
+@experimental
+class RubricBasedFinalResponseQualityV1Evaluator(LlmAsJudge):
+  """An Evaluator for rubric-based assessment of the agent's final response using an LLM.
+
+  The evaluator uses a set of rubrics to assess the quality of the agent's
+  final response.
+
+  Example: For a weather agent that responds to weather-related queries of the
+  user, one could specify the following rubrics:
+
+  Rubric 1: Agent's response is direct and to the point.
+  Rubric 2: Agent's response accurately inferred the user's underlying goal
+  from ambiguous queries (e.g. "is it a beach weather?" would mean sun, warmth
+  and low wind).
+
+  For each rubric, this evaluator will generate a confidence score between 0
+  and 1, where 0 means that the agent's response did not satisfy the rubric at
+  all and 1 means complete adherence. Values closer to 1 are desirable.
+
+  A combined score using individual rubric confidences will also be generated.
+  Like individual rubric confidence scores, the range for this value will be
+  between 0 and 1, and it will have the same interpretation.
+  """
+
+  criterion_type: ClassVar[type[RubricsBasedCriterion]] = RubricsBasedCriterion
+
+  def __init__(self, eval_metric: EvalMetric):
+    super().__init__(
+        eval_metric,
+        criterion_type=RubricBasedFinalResponseQualityV1Evaluator.criterion_type,
+    )
+    self._auto_rater_prompt_template = (
+        _RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1_PROMPT
+    )
+
+    assert self._criterion.rubrics, "Rubrics are required."
+
+    self._rubrics: list[Rubric] = self._criterion.rubrics
+
+    self._normalized_rubric_to_id_map = {
+        _normalize_text(r.rubric_content.text_property): r.rubric_id
+        for r in self._rubrics
+    }
+
+  @staticmethod
+  def get_metric_info() -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
+        description=(
+            "This metric assesses the agent's final response against a set of"
+            " rubrics using an LLM as a judge. The value range for this metric"
+            " is [0,1], with values closer to 1 being more desirable."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
+  @override
+  def format_auto_rater_prompt(
+      self, actual_invocation: Invocation, _: Invocation
+  ) -> str:
+    """Returns the auto-rater prompt."""
+
+    user_input = get_text_from_content(actual_invocation.user_content)
+    final_response = get_text_from_content(actual_invocation.final_response)
+    rubrics = "\n* ".join(
+        [r.rubric_content.text_property for r in self._rubrics]
+    )
+
+    developer_instructions = ""
+    tool_declarations = "Agent has no tools."
+    response_steps = get_tool_calls_and_responses_as_json_str(
+        actual_invocation.intermediate_data
+    )
+
+    app_details = actual_invocation.app_details
+    if app_details:
+      if (
+          isinstance(actual_invocation.intermediate_data, InvocationEvents)
+          and actual_invocation.intermediate_data.invocation_events
+      ):
+        developer_instructions = app_details.get_developer_instructions(
+            agent_name=actual_invocation.intermediate_data.invocation_events[
+                0
+            ].author
+        )
+      tool_declarations = get_tool_declarations_as_json_str(app_details)
+
+    auto_rater_prompt = self._auto_rater_prompt_template.format(
+        developer_instructions=developer_instructions,
+        tool_declarations=tool_declarations,
+        user_input=user_input,
+        response_steps=response_steps,
+        final_response=final_response,
+        rubrics=rubrics,
+    )
+
+    return auto_rater_prompt
+
+  @override
+  def convert_auto_rater_response_to_score(
+      self, auto_rater_response: LlmResponse
+  ) -> AutoRaterScore:
+    """Returns an AutoRaterScore generated from the AutoRater's response."""
+    response_text = get_text_from_content(auto_rater_response.content)
+    rubric_responses = _parse_auto_rater_response(response_text)
+    rubric_scores = []
+
+    for rubric_response in rubric_responses:
+      normalized_rubric = _normalize_text(rubric_response.property_text)
+      rubric_id = self._normalized_rubric_to_id_map.get(normalized_rubric, None)
+      if rubric_id:
+        rubric_scores.append(
+            RubricScore(
+                rubric_id=rubric_id,
+                rationale=rubric_response.rationale,
+                score=rubric_response.score,
+            )
+        )
+      else:
+        logger.warning(
+            f"Rubric {rubric_response.property_text} not found in the rubrics"
+            " provided to the metric."
+        )
+
+    aggregated_score = get_average_rubric_score(rubric_scores)
+    return AutoRaterScore(score=aggregated_score, rubric_scores=rubric_scores)
+
+  @override
+  def aggregate_per_invocation_samples(
+      self,
+      per_invocation_samples: list[PerInvocationResult],
+  ) -> PerInvocationResult:
+    """Returns a combined result for the invocation.
+
+    This AutoRater is backed by an LLM, which is known to have a certain
+    degree of unreliability in its responses. To counter that, we sample the
+    auto-rater more than once for a single invocation.
+
+    This method takes all those samples for a single invocation and combines
+    them to generate a single result for the invocation.
+
+    This method specifically uses majority vote to aggregate scores for a
+    rubric. Take the following Invocation and Rubric, for example:
+
+    Invocation:
+      User: Is it going to be cold in Seattle tomorrow?
+      Weather Agent: No, it will be moderately warm as the predicted
+        temperature for Seattle, WA tomorrow is 88F.
+
+    Rubric: Agent's response was concise and to the point.
+
+    We will sample the AutoRater 5 times, and the AutoRater responds
+    with (skipping the rationale field for now):
+      Sample 1:
+        Verdict: Yes
+      Sample 2:
+        Verdict: No
+      Sample 3:
+        Verdict: Yes
+      Sample 4:
+        Verdict: Yes
+      Sample 5:
+        Verdict: No
+
+    This method will use majority vote and combine the results of 5 samples
+    into one, and it will report "Yes" as the final verdict.
+    """
+    score_category_by_rubric_id = {}
+
+    # We go over each rubric for each sample, and categorize the rubric into
+    # one of the following buckets:
+    # - Bucket 0: No score was generated for the rubric
+    # - Bucket 1: Score was generated and it was positive (1.0)
+    # - Bucket 2: Score was generated and it was negative (0.0)
+    for sample in per_invocation_samples:
+      if not sample.rubric_scores:
+        continue
+
+      for rubric_score in sample.rubric_scores:
+        rubric_id = rubric_score.rubric_id
+        if rubric_id not in score_category_by_rubric_id:
+          score_category_by_rubric_id[rubric_id] = ([], [], [])
+
+        if rubric_score.score is None:  # No score
+          score_category_by_rubric_id[rubric_id][0].append(rubric_score)
+        elif rubric_score.score == 1.0:  # Positive result
+          score_category_by_rubric_id[rubric_id][1].append(rubric_score)
+        else:  # Negative result
+          score_category_by_rubric_id[rubric_id][2].append(rubric_score)
+
+    aggregated_rubric_scores = []
+    for rubric_id in score_category_by_rubric_id:
+      no_scores, positives, negatives = score_category_by_rubric_id[rubric_id]
+
+      if not positives and not negatives:
+        # There has to be at least one no-score rubric!
+        aggregated_rubric_scores.append(no_scores[0])
+
+      # This is where we are taking a majority vote.
+      elif len(positives) > len(negatives):
+        aggregated_rubric_scores.append(positives[0])
+      else:
+        aggregated_rubric_scores.append(negatives[0])
+
+    aggregated_overall_score = get_average_rubric_score(
+        aggregated_rubric_scores
+    )
+
+    return PerInvocationResult(
+        actual_invocation=per_invocation_samples[0].actual_invocation,
+        expected_invocation=per_invocation_samples[0].expected_invocation,
+        score=aggregated_overall_score,
+        rubric_scores=aggregated_rubric_scores,
+        eval_status=get_eval_status(
+            aggregated_overall_score, self._eval_metric.threshold
+        ),
+    )
+
+  @override
+  def aggregate_invocation_results(
+      self, per_invocation_results: list[PerInvocationResult]
+  ) -> EvaluationResult:
+    """Aggregates per-invocation evaluation results into a single score.
+
+    A single eval case can have multiple invocations and the eval metric is
+    assessed for each invocation. But we do want to make an aggregate
+    statement on how the eval case as a whole performed on the metric.
+
+    This method helps us aggregate rubric scores across invocations.
+
+    Do note that the aggregation strategy used here is different from the one
+    used in the `aggregate_per_invocation_samples` method, where we used
+    majority vote. In this method, we actually calculate the mean score of a
+    rubric across several invocations, as a majority score would be
+    misleading.
+    """
+
+    unaggregated_rubric_scores = []  # Later used to calculate average.
+
+    # Collect rubric scores by id, so that we can calculate average score
+    # for each rubric id.
+    rubric_scores_by_id = {}
+    for sample in per_invocation_results:
+      if not sample.rubric_scores:
+        continue
+
+      for rubric_score in sample.rubric_scores:
+        rubric_id = rubric_score.rubric_id
+        if rubric_id not in rubric_scores_by_id:
+          rubric_scores_by_id[rubric_id] = []
+
+        rubric_scores_by_id[rubric_id].append(rubric_score)
+        unaggregated_rubric_scores.append(rubric_score)
+
+    aggregated_rubric_scores = []
+    for rubric_id, rubric_scores in rubric_scores_by_id.items():
+      overall_score = get_average_rubric_score(rubric_scores)
+      aggregated_rubric_scores.append(
+          RubricScore(
+              rubric_id=rubric_id,
+              score=overall_score,
+              # There is no real way for us to generate a rationale here, so
+              # we make this clear to the consumer of the result.
+              rationale=(
+                  "This is an aggregated score derived from individual"
+                  " entries. Please refer to individual entries in each"
+                  " invocation for the actual rationale from the model."
+              ),
+          )
+      )
+
+    # Use unaggregated rubric scores to calculate the overall score.
+    aggregated_overall_score = get_average_rubric_score(
+        unaggregated_rubric_scores
+    )
+    return EvaluationResult(
+        overall_score=aggregated_overall_score,
+        overall_eval_status=get_eval_status(
+            aggregated_overall_score, self._eval_metric.threshold
+        ),
+        per_invocation_results=per_invocation_results,
+        overall_rubric_scores=aggregated_rubric_scores,
+    )
diff --git a/tests/unittests/evaluation/test_app_details.py b/tests/unittests/evaluation/test_app_details.py
new file mode 100644
index 00000000..b96581f5
--- /dev/null
+++ b/tests/unittests/evaluation/test_app_details.py
@@ -0,0 +1,73 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import annotations + +from google.adk.evaluation.app_details import AgentDetails +from google.adk.evaluation.app_details import AppDetails +from google.genai import types as genai_types +from pytest import raises + + +def test_get_developer_instructions_existing_agent(): + agent_details = { + 'agent1': AgentDetails( + name='agent1', instructions='instruction for agent1' + ), + 'agent2': AgentDetails( + name='agent2', instructions='instruction for agent2' + ), + } + app_details = AppDetails( + agent_details=agent_details, + ) + + # Test for existing agent + instructions = app_details.get_developer_instructions('agent1') + assert instructions == 'instruction for agent1' + + +def test_get_developer_instructions_non_existing_Agent(): + agent_details = { + 'agent1': AgentDetails( + name='agent1', instructions='instruction for agent1' + ), + 'agent2': AgentDetails( + name='agent2', instructions='instruction for agent2' + ), + } + app_details = AppDetails( + agent_details=agent_details, + ) + + # Test for existing agent + with raises(ValueError, match='`agent3` not found in the agentic system.'): + app_details.get_developer_instructions('agent3') + + +def test_get_tools_by_agent_name(): + tool1 = genai_types.Tool( + function_declarations=[genai_types.FunctionDeclaration(name='tool1_func')] + ) + agent_details = { + 'agent1': AgentDetails(name='agent1', tool_declarations=[tool1]), + 'agent2': AgentDetails(name='agent2', tool_declarations=[]), + } + app_details = AppDetails( + agent_details=agent_details, + ) + + tools = app_details.get_tools_by_agent_name() + expected_tools = {'agent1': [tool1], 'agent2': []} + assert tools == expected_tools diff --git a/tests/unittests/evaluation/test_eval_case.py b/tests/unittests/evaluation/test_eval_case.py index 01cb9b62..bea81d46 100644 --- a/tests/unittests/evaluation/test_eval_case.py +++ b/tests/unittests/evaluation/test_eval_case.py @@ -15,6 +15,8 @@ from __future__ import annotations from google.adk.evaluation.eval_case import get_all_tool_calls +from google.adk.evaluation.eval_case import get_all_tool_calls_with_responses +from google.adk.evaluation.eval_case import get_all_tool_responses from google.adk.evaluation.eval_case import IntermediateData from google.adk.evaluation.eval_case import InvocationEvent from google.adk.evaluation.eval_case import InvocationEvents @@ -97,3 +99,150 @@ def test_get_all_tool_calls_with_unsupported_type(): ValueError, match='Unsupported type for intermediate_data' ): get_all_tool_calls('this is not a valid type') + + +def test_get_all_tool_responses_with_none_input(): + """Tests that an empty list is returned when intermediate_data is None.""" + assert get_all_tool_responses(None) == [] + + +def test_get_all_tool_responses_with_empty_invocation_events(): + """Tests InvocationEvents with an empty list of events.""" + intermediate_data = InvocationEvents(invocation_events=[]) + assert get_all_tool_responses(intermediate_data) == [] + + +def test_get_all_tool_responses_with_invocation_events_no_tools(): + """Tests InvocationEvents containing events without any tool responses.""" + invocation_event = InvocationEvent( + author='agent', + content=genai_types.Content( + parts=[genai_types.Part(text='Thinking...')], role='model' + ), + ) + intermediate_data = InvocationEvents(invocation_events=[invocation_event]) + assert get_all_tool_responses(intermediate_data) == [] + + +def test_get_all_tool_responses_with_invocation_events(): + """Tests that tool responses are correctly extracted from a InvocationEvents 
object.""" + tool_response1 = genai_types.FunctionResponse( + name='search', response={'result': 'weather is good'} + ) + tool_response2 = genai_types.FunctionResponse( + name='lookup', response={'id': '123'} + ) + invocation_event1 = InvocationEvent( + author='agent1', + content=genai_types.Content( + parts=[genai_types.Part(function_response=tool_response1)], + role='model', + ), + ) + invocation_event2 = InvocationEvent( + author='agent2', + content=genai_types.Content( + parts=[ + genai_types.Part(text='Found something.'), + genai_types.Part(function_response=tool_response2), + ], + role='model', + ), + ) + intermediate_data = InvocationEvents( + invocation_events=[invocation_event1, invocation_event2] + ) + assert get_all_tool_responses(intermediate_data) == [ + tool_response1, + tool_response2, + ] + + +def test_get_all_tool_responses_with_unsupported_type(): + """Tests that a ValueError is raised for unsupported intermediate_data types.""" + with pytest.raises( + ValueError, match='Unsupported type for intermediate_data' + ): + get_all_tool_responses('this is not a valid type') + + +def test_get_all_tool_calls_with_responses_with_none_input(): + """Tests that an empty list is returned when intermediate_data is None.""" + assert get_all_tool_calls_with_responses(None) == [] + + +def test_get_all_tool_calls_with_responses_with_intermediate_data_no_tool_calls(): + """Tests get_all_tool_calls_with_responses with IntermediateData with no tool calls.""" + # No tool calls + intermediate_data = IntermediateData(tool_uses=[], tool_responses=[]) + assert get_all_tool_calls_with_responses(intermediate_data) == [] + + +def test_get_all_tool_calls_with_responses_with_intermediate_data_with_tool_calls(): + """Tests get_all_tool_calls_with_responses with IntermediateData with tools.""" + # With matching and non-matching tool calls + tool_call1 = genai_types.FunctionCall( + name='search', args={'query': 'weather'}, id='call1' + ) + tool_response1 = genai_types.FunctionResponse( + name='search', response={'result': 'sunny'}, id='call1' + ) + tool_call2 = genai_types.FunctionCall( + name='lookup', args={'id': '123'}, id='call2' + ) + intermediate_data = IntermediateData( + tool_uses=[tool_call1, tool_call2], tool_responses=[tool_response1] + ) + assert get_all_tool_calls_with_responses(intermediate_data) == [ + (tool_call1, tool_response1), + (tool_call2, None), + ] + + +def test_get_all_tool_calls_with_responses_with_steps_no_tool_calls(): + """Tests get_all_tool_calls_with_responses with Steps that don't have tool calls.""" + # No tool calls + intermediate_data = InvocationEvents(invocation_events=[]) + assert get_all_tool_calls_with_responses(intermediate_data) == [] + + +def test_get_all_tool_calls_with_responses_with_invocation_events(): + """Tests get_all_tool_calls_with_responses with InvocationEvents.""" + # No tools + intermediate_data = InvocationEvents(invocation_events=[]) + assert get_all_tool_calls_with_responses(intermediate_data) == [] + + # With matching and non-matching tool calls + tool_call1 = genai_types.FunctionCall( + name='search', args={'query': 'weather'}, id='call1' + ) + tool_response1 = genai_types.FunctionResponse( + name='search', response={'result': 'sunny'}, id='call1' + ) + tool_call2 = genai_types.FunctionCall( + name='lookup', args={'id': '123'}, id='call2' + ) + invocation_event1 = InvocationEvent( + author='agent', + content=genai_types.Content( + parts=[ + genai_types.Part(function_call=tool_call1), + genai_types.Part(function_call=tool_call2), + ], + 
role='model', + ), + ) + invocation_event2 = InvocationEvent( + author='tool', + content=genai_types.Content( + parts=[genai_types.Part(function_response=tool_response1)], + role='tool', + ), + ) + intermediate_data = InvocationEvents( + invocation_events=[invocation_event1, invocation_event2] + ) + assert get_all_tool_calls_with_responses(intermediate_data) == [ + (tool_call1, tool_response1), + (tool_call2, None), + ] diff --git a/tests/unittests/evaluation/test_final_response_match_v2.py b/tests/unittests/evaluation/test_final_response_match_v2.py index 382b7a7d..a40dbe09 100644 --- a/tests/unittests/evaluation/test_final_response_match_v2.py +++ b/tests/unittests/evaluation/test_final_response_match_v2.py @@ -17,12 +17,13 @@ from __future__ import annotations from google.adk.evaluation.eval_case import Invocation from google.adk.evaluation.eval_metrics import BaseCriterion from google.adk.evaluation.eval_metrics import EvalMetric +from google.adk.evaluation.eval_metrics import EvalStatus from google.adk.evaluation.eval_metrics import JudgeModelOptions from google.adk.evaluation.eval_metrics import PrebuiltMetrics -from google.adk.evaluation.evaluator import EvalStatus from google.adk.evaluation.evaluator import PerInvocationResult from google.adk.evaluation.final_response_match_v2 import _parse_critique from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator +from google.adk.evaluation.llm_as_judge import AutoRaterScore from google.adk.evaluation.llm_as_judge_utils import Label from google.adk.models.llm_response import LlmResponse from google.genai import types as genai_types @@ -206,8 +207,10 @@ def test_convert_auto_rater_response_to_score_valid(): role="model", ) ) - score = evaluator.convert_auto_rater_response_to_score(llm_response) - assert score == 1.0 + auto_rater_score = evaluator.convert_auto_rater_response_to_score( + llm_response + ) + assert auto_rater_score == AutoRaterScore(score=1.0) def test_convert_auto_rater_response_to_score_invalid(): @@ -224,8 +227,10 @@ def test_convert_auto_rater_response_to_score_invalid(): role="model", ) ) - score = evaluator.convert_auto_rater_response_to_score(llm_response) - assert score == 0.0 + auto_rater_score = evaluator.convert_auto_rater_response_to_score( + llm_response + ) + assert auto_rater_score == AutoRaterScore(score=0.0) def test_convert_auto_rater_response_to_score_invalid_json(): @@ -236,8 +241,10 @@ def test_convert_auto_rater_response_to_score_invalid_json(): role="model", ) ) - score = evaluator.convert_auto_rater_response_to_score(llm_response) - assert score is None + auto_rater_score = evaluator.convert_auto_rater_response_to_score( + llm_response + ) + assert auto_rater_score == AutoRaterScore() def test_convert_auto_rater_response_to_score_missing_key(): @@ -248,8 +255,10 @@ def test_convert_auto_rater_response_to_score_missing_key(): role="model", ) ) - score = evaluator.convert_auto_rater_response_to_score(llm_response) - assert score is None + auto_rater_score = evaluator.convert_auto_rater_response_to_score( + llm_response + ) + assert auto_rater_score == AutoRaterScore() def test_aggregate_per_invocation_samples_none_evaluated(): diff --git a/tests/unittests/evaluation/test_llm_as_judge.py b/tests/unittests/evaluation/test_llm_as_judge.py index d03d88b2..6618e6c1 100644 --- a/tests/unittests/evaluation/test_llm_as_judge.py +++ b/tests/unittests/evaluation/test_llm_as_judge.py @@ -24,6 +24,7 @@ from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion from 
google.adk.evaluation.evaluator import EvalStatus from google.adk.evaluation.evaluator import EvaluationResult from google.adk.evaluation.evaluator import PerInvocationResult +from google.adk.evaluation.llm_as_judge import AutoRaterScore from google.adk.evaluation.llm_as_judge import LlmAsJudge from google.adk.evaluation.llm_as_judge_utils import get_eval_status from google.adk.evaluation.llm_as_judge_utils import get_text_from_content @@ -41,8 +42,8 @@ class MockLlmAsJudge(LlmAsJudge): def convert_auto_rater_response_to_score( self, llm_response: LlmResponse - ) -> Optional[float]: - return 1.0 + ) -> AutoRaterScore: + return AutoRaterScore(score=1.0) def aggregate_per_invocation_samples( self, diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py new file mode 100644 index 00000000..2e3472f5 --- /dev/null +++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py @@ -0,0 +1,290 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import json + +from google.adk.evaluation.app_details import AgentDetails +from google.adk.evaluation.app_details import AppDetails +from google.adk.evaluation.eval_case import IntermediateData +from google.adk.evaluation.eval_case import InvocationEvent +from google.adk.evaluation.eval_case import InvocationEvents +from google.adk.evaluation.eval_rubrics import RubricScore +from google.adk.evaluation.evaluator import EvalStatus +from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score +from google.adk.evaluation.llm_as_judge_utils import get_eval_status +from google.adk.evaluation.llm_as_judge_utils import get_text_from_content +from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str +from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str +from google.genai import types as genai_types + + +def test_get_text_from_content_with_none(): + """Tests get_text_from_content with None as input.""" + assert get_text_from_content(None) is None + + +def test_get_text_from_content_with_content_and_none_parts(): + """Tests get_text_from_content with Content that has None for parts.""" + content = genai_types.Content(parts=None) + assert get_text_from_content(content) is None + + +def test_get_text_from_content_with_empty_parts(): + """Tests get_text_from_content with an empty parts list.""" + content = genai_types.Content(parts=[]) + assert get_text_from_content(content) == None + + +def test_get_text_from_content_with_parts_but_no_text(): + """Tests get_text_from_content with parts that do not contain text.""" + content = genai_types.Content( + parts=[ + genai_types.Part( + function_call=genai_types.FunctionCall(name="test_func") + ) + ] + ) + assert get_text_from_content(content) == "" + + +def test_get_text_from_content_with_single_text_part(): + """Tests get_text_from_content with a single text part.""" + content = 
genai_types.Content(parts=[genai_types.Part(text="Hello")]) + assert get_text_from_content(content) == "Hello" + + +def test_get_text_from_content_with_multiple_text_parts(): + """Tests get_text_from_content with multiple text parts.""" + content = genai_types.Content( + parts=[genai_types.Part(text="Hello"), genai_types.Part(text="World")] + ) + assert get_text_from_content(content) == "Hello\nWorld" + + +def test_get_text_from_content_with_mixed_parts(): + """Tests get_text_from_content with a mix of text and non-text parts.""" + content = genai_types.Content( + parts=[ + genai_types.Part(text="Hello"), + genai_types.Part( + function_call=genai_types.FunctionCall(name="test_func") + ), + genai_types.Part(text="World"), + ] + ) + assert get_text_from_content(content) == "Hello\nWorld" + + +def test_get_eval_status_with_none_score(): + """Tests get_eval_status returns NOT_EVALUATED for a None score.""" + assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED + + +def test_get_eval_status_when_score_is_greater_than_threshold(): + """Tests get_eval_status returns PASSED when score > threshold.""" + assert get_eval_status(score=0.8, threshold=0.5) == EvalStatus.PASSED + + +def test_get_eval_status_when_score_is_equal_to_threshold(): + """Tests get_eval_status returns PASSED when score == threshold.""" + assert get_eval_status(score=0.5, threshold=0.5) == EvalStatus.PASSED + + +def test_get_eval_status_when_score_is_less_than_threshold(): + """Tests get_eval_status returns FAILED when score < threshold.""" + assert get_eval_status(score=0.4, threshold=0.5) == EvalStatus.FAILED + + +def test_get_average_rubric_score_with_empty_list(): + """Tests get_average_rubric_score returns None for an empty list.""" + assert get_average_rubric_score([]) is None + + +def test_get_average_rubric_score_with_all_none_scores(): + """Tests get_average_rubric_score returns None when all scores are None.""" + rubric_scores = [ + RubricScore(rubric_id="1", score=None), + RubricScore(rubric_id="2", score=None), + ] + assert get_average_rubric_score(rubric_scores) is None + + +def test_get_average_rubric_score_with_single_score(): + """Tests get_average_rubric_score with a single valid score.""" + rubric_scores = [RubricScore(rubric_id="1", score=0.8)] + assert get_average_rubric_score(rubric_scores) == 0.8 + + +def test_get_average_rubric_score_with_multiple_scores(): + """Tests get_average_rubric_score with multiple valid scores.""" + rubric_scores = [ + RubricScore(rubric_id="1", score=0.8), + RubricScore(rubric_id="2", score=0.6), + ] + assert get_average_rubric_score(rubric_scores) == 0.7 + + +def test_get_average_rubric_score_with_mixed_scores(): + """Tests get_average_rubric_score with a mix of valid and None scores.""" + rubric_scores = [ + RubricScore(rubric_id="1", score=0.8), + RubricScore(rubric_id="2", score=None), + RubricScore(rubric_id="3", score=0.6), + ] + assert get_average_rubric_score(rubric_scores) == 0.7 + + +def test_get_tool_declarations_as_json_str_with_no_agents(): + """Tests get_tool_declarations_as_json_str with no agents.""" + app_details = AppDetails(agent_details={}) + expected_json = {"tool_declarations": {}} + actual_json_str = get_tool_declarations_as_json_str(app_details) + assert json.loads(actual_json_str) == expected_json + + +def test_get_tool_declarations_as_json_str_with_agent_no_tools(): + """Tests get_tool_declarations_as_json_str with an agent that has no tools.""" + agent_details = {"agent1": AgentDetails(name="agent1", tool_declarations=[])} + 
app_details = AppDetails(agent_details=agent_details) + expected_json = {"tool_declarations": {"agent1": []}} + actual_json_str = get_tool_declarations_as_json_str(app_details) + assert json.loads(actual_json_str) == expected_json + + +def test_get_tool_declarations_as_json_str_with_agent_with_tools(): + """Tests get_tool_declarations_as_json_str with an agent that has tools.""" + tool1 = genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="test_func", description="A test function." + ) + ] + ) + agent_details = { + "agent1": AgentDetails(name="agent1", tool_declarations=[tool1]) + } + app_details = AppDetails(agent_details=agent_details) + expected_json = { + "tool_declarations": { + "agent1": [{ + "function_declarations": [{ + "name": "test_func", + "description": "A test function.", + }] + }] + } + } + actual_json_str = get_tool_declarations_as_json_str(app_details) + assert json.loads(actual_json_str) == expected_json + + +def test_get_tool_declarations_as_json_str_with_multiple_agents(): + """Tests get_tool_declarations_as_json_str with multiple agents.""" + tool1 = genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="test_func1", description="A test function 1." + ) + ] + ) + agent_details = { + "agent1": AgentDetails(name="agent1", tool_declarations=[tool1]), + "agent2": AgentDetails(name="agent2", tool_declarations=[]), + } + app_details = AppDetails(agent_details=agent_details) + expected_json = { + "tool_declarations": { + "agent1": [{ + "function_declarations": [{ + "name": "test_func1", + "description": "A test function 1.", + }] + }], + "agent2": [], + } + } + actual_json_str = get_tool_declarations_as_json_str(app_details) + assert json.loads(actual_json_str) == expected_json + + +def test_get_tool_calls_and_responses_as_json_str_with_none(): + """Tests get_tool_calls_and_responses_as_json_str with None.""" + assert ( + get_tool_calls_and_responses_as_json_str(None) + == "No intermediate steps were taken." + ) + + +def test_get_tool_calls_and_responses_as_json_str_with_intermediate_data_no_tools(): + """Tests get_tool_calls_and_responses_as_json_str with IntermediateData and no tools.""" + intermediate_data = IntermediateData(tool_uses=[], tool_responses=[]) + assert ( + get_tool_calls_and_responses_as_json_str(intermediate_data) + == "No intermediate steps were taken." + ) + + intermediate_data = InvocationEvents(invocation_events=[]) + assert ( + get_tool_calls_and_responses_as_json_str(intermediate_data) + == "No intermediate steps were taken." 
+ ) + + +def test_get_tool_calls_and_responses_as_json_str_with_invocation_events_multiple_calls(): + """Tests get_tool_calls_and_responses_as_json_str with multiple calls in InvocationEvents.""" + tool_call1 = genai_types.FunctionCall(name="func1", args={}, id="call1") + tool_call2 = genai_types.FunctionCall(name="func2", args={}, id="call2") + tool_response1 = genai_types.FunctionResponse( + name="func1", response={"status": "ok"}, id="call1" + ) + invocation_event1 = InvocationEvent( + author="agent", + content=genai_types.Content( + parts=[ + genai_types.Part(function_call=tool_call1), + genai_types.Part(function_call=tool_call2), + ] + ), + ) + invocation_event2 = InvocationEvent( + author="tool", + content=genai_types.Content( + parts=[genai_types.Part(function_response=tool_response1)] + ), + ) + intermediate_data = InvocationEvents( + invocation_events=[invocation_event1, invocation_event2] + ) + json_str = get_tool_calls_and_responses_as_json_str(intermediate_data) + expected_json = { + "tool_calls_and_response": [ + { + "step": 0, + "tool_call": {"name": "func1", "args": {}, "id": "call1"}, + "tool_response": { + "name": "func1", + "response": {"status": "ok"}, + "id": "call1", + }, + }, + { + "step": 1, + "tool_call": {"name": "func2", "args": {}, "id": "call2"}, + "tool_response": "None", + }, + ] + } + assert json.loads(json_str) == expected_json diff --git a/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py b/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py new file mode 100644 index 00000000..01d119d9 --- /dev/null +++ b/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py @@ -0,0 +1,650 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from google.adk.evaluation.app_details import AgentDetails +from google.adk.evaluation.app_details import AppDetails +from google.adk.evaluation.eval_case import IntermediateData +from google.adk.evaluation.eval_case import Invocation +from google.adk.evaluation.eval_case import InvocationEvent +from google.adk.evaluation.eval_case import InvocationEvents +from google.adk.evaluation.eval_metrics import EvalMetric +from google.adk.evaluation.eval_metrics import JudgeModelOptions +from google.adk.evaluation.eval_metrics import PrebuiltMetrics +from google.adk.evaluation.eval_metrics import RubricsBasedCriterion +from google.adk.evaluation.eval_rubrics import Rubric +from google.adk.evaluation.eval_rubrics import RubricContent +from google.adk.evaluation.eval_rubrics import RubricScore +from google.adk.evaluation.evaluator import EvalStatus +from google.adk.evaluation.evaluator import EvaluationResult +from google.adk.evaluation.evaluator import PerInvocationResult +from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score +from google.adk.evaluation.rubric_based_final_response_quality_v1 import _parse_auto_rater_response +from google.adk.evaluation.rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator +from google.adk.models.llm_response import LlmResponse +from google.genai import types as genai_types +import pytest + + +@pytest.fixture +def evaluator() -> RubricBasedFinalResponseQualityV1Evaluator: + """Returns a RubricBasedFinalResponseQualityV1Evaluator.""" + rubrics = [ + Rubric( + rubric_id="1", + rubric_content=RubricContent(text_property="Is the response good?"), + ), + Rubric( + rubric_id="2", + rubric_content=RubricContent(text_property="Is the response bad?"), + ), + ] + judge_model_options = JudgeModelOptions( + judge_model_config=None, + num_samples=3, + ) + criterion = RubricsBasedCriterion( + threshold=0.5, rubrics=rubrics, judge_model_options=judge_model_options + ) + metric = EvalMetric( + metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value, + threshold=0.5, + criterion=criterion, + ) + return RubricBasedFinalResponseQualityV1Evaluator(metric) + + +def _create_per_invocation_result( + rubric_scores: list[RubricScore], +) -> PerInvocationResult: + """Helper to create a PerInvocationResult.""" + return PerInvocationResult( + actual_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="part_1")] + ) + ), + expected_invocation=Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="part_2")] + ) + ), + score=get_average_rubric_score(rubric_scores), + rubric_scores=rubric_scores, + eval_status=EvalStatus.NOT_EVALUATED, + ) + + +def test_format_auto_rater_prompt_with_basic_invocation( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests format_auto_rater_prompt with a basic invocation.""" + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="User input here.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="Final agent response.")] + ), + ) + prompt = evaluator.format_auto_rater_prompt(invocation, None) + + assert "User input here." in prompt + assert "Final agent response." in prompt + assert "Is the response good?" in prompt + assert "Is the response bad?" 
in prompt + assert "\n \n " in prompt + assert ( + "\n Agent has no tools.\n " in prompt + ) + assert ( + "\n No intermediate steps were taken.\n " + " " + ) in prompt + + +def test_format_auto_rater_prompt_with_app_details( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests format_auto_rater_prompt with app_details in invocation.""" + tool = genai_types.Tool( + function_declarations=[ + genai_types.FunctionDeclaration( + name="test_func", description="A test function." + ) + ] + ) + app_details = AppDetails( + agent_details={ + "agent1": AgentDetails( + name="agent1", + instructions="This is an agent instruction.", + tool_declarations=[tool], + ) + }, + ) + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="User input here.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="Final agent response.")] + ), + app_details=app_details, + intermediate_data=InvocationEvents( + invocation_events=[InvocationEvent(author="agent1", content=None)] + ), + ) + prompt = evaluator.format_auto_rater_prompt(invocation, None) + + assert "This is an agent instruction." in prompt + assert '"name": "test_func"' in prompt + assert '"description": "A test function."' in prompt + + +def test_format_auto_rater_prompt_with_intermediate_data( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests format_auto_rater_prompt with intermediate_data in invocation.""" + tool_call = genai_types.FunctionCall( + name="test_func", args={"arg1": "val1"}, id="call1" + ) + tool_response = genai_types.FunctionResponse( + name="test_func", response={"result": "ok"}, id="call1" + ) + intermediate_data = IntermediateData( + tool_uses=[tool_call], tool_responses=[tool_response] + ) + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="User input here.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="Final agent response.")] + ), + intermediate_data=intermediate_data, + ) + prompt = evaluator.format_auto_rater_prompt(invocation, None) + + assert '"step": 0' in prompt + assert '"tool_call":' in prompt + assert '"name": "test_func"' in prompt + assert '"tool_response":' in prompt + assert '"result": "ok"' in prompt + + +def test_format_auto_rater_prompt_with_app_details_no_tools( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests format_auto_rater_prompt with app_details but no tools.""" + app_details = AppDetails( + agent_details={ + "agent1": AgentDetails(name="agent1", tool_declarations=[]) + }, + ) + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="User input here.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="Final agent response.")] + ), + app_details=app_details, + ) + prompt = evaluator.format_auto_rater_prompt(invocation, None) + + assert '"tool_declarations": {\n "agent1": []\n }' in prompt + + +def test_format_auto_rater_prompt_with_intermediate_data_no_tools( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests format_auto_rater_prompt with intermediate_data but no tool calls.""" + intermediate_data = IntermediateData(tool_uses=[], tool_responses=[]) + invocation = Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="User input here.")] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text="Final agent response.")] + ), + intermediate_data=intermediate_data, + ) + prompt = 
evaluator.format_auto_rater_prompt(invocation, None) + + assert "No intermediate steps were taken." in prompt + + +def test_parse_auto_rater_response_with_empty_string(): + """Tests _parse_auto_rater_response with an empty string.""" + assert _parse_auto_rater_response("") == [] + + +def test_parse_auto_rater_response_with_malformed_string(): + """Tests _parse_auto_rater_response with a malformed string.""" + response = "This is just some random text without the expected format." + assert _parse_auto_rater_response(response) == [] + + +def test_parse_auto_rater_response_with_single_yes_verdict(): + """Tests _parse_auto_rater_response with a single 'yes' verdict.""" + response = """ + Property: Is the response good? + Rationale: It was good. + Verdict: yes + """ + parsed = _parse_auto_rater_response(response) + assert len(parsed) == 1 + assert parsed[0].property_text == "Is the response good?" + assert parsed[0].rationale == "It was good." + assert parsed[0].score == 1.0 + + +def test_parse_auto_rater_response_with_single_no_verdict(): + """Tests _parse_auto_rater_response with a single 'no' verdict.""" + response = """ + Property: Is the response bad? + Rationale: It was bad. + Verdict: no + """ + parsed = _parse_auto_rater_response(response) + assert len(parsed) == 1 + assert parsed[0].property_text == "Is the response bad?" + assert parsed[0].rationale == "It was bad." + assert parsed[0].score == 0.0 + + +def test_parse_auto_rater_response_with_invalid_verdict(): + """Tests _parse_auto_rater_response with an invalid verdict.""" + response = """ + Property: Is it unclear? + Rationale: I cannot tell. + Verdict: maybe + """ + parsed = _parse_auto_rater_response(response) + assert len(parsed) == 1 + assert parsed[0].property_text == "Is it unclear?" + assert parsed[0].rationale == "I cannot tell." + assert parsed[0].score is None + + +def test_parse_auto_rater_response_with_multiple_verdicts(): + """Tests _parse_auto_rater_response with multiple verdicts.""" + response = """ + Property: Is the response good? + Rationale: It was good. + Verdict: yes + + Property: Is the response bad? + Rationale: It was not bad. + Verdict: no + """ + parsed = _parse_auto_rater_response(response) + assert len(parsed) == 2 + assert parsed[0].property_text == "Is the response good?" + assert parsed[0].rationale == "It was good." + assert parsed[0].score == 1.0 + assert parsed[1].property_text == "Is the response bad?" + assert parsed[1].rationale == "It was not bad." + assert parsed[1].score == 0.0 + + +def test_parse_auto_rater_response_with_incomplete_entry(): + """Tests _parse_auto_rater_response with an incomplete entry.""" + response = """ + Property: Is the response good? + Rationale: It was good. + Verdict: yes + + Property: Is the response bad? + Rationale: It was not bad. + """ # Missing Verdict + parsed = _parse_auto_rater_response(response) + assert len(parsed) == 1 # zip will only create one item + assert parsed[0].property_text == "Is the response good?" + + +def test_parse_auto_rater_response_with_case_insensitive_verdict(): + """Tests _parse_auto_rater_response is case-insensitive for verdicts.""" + response = """ + Property: Is the response good? + Rationale: It was good. + Verdict: Yes + Property: Is the response bad? + Rationale: It was bad. 
+ Verdict: NO + """ + parsed = _parse_auto_rater_response(response) + assert len(parsed) == 2 + assert parsed[0].score == 1.0 + assert parsed[1].score == 0.0 + + +def test_convert_auto_rater_response_to_score_with_empty_response( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests convert_auto_rater_response_to_score with an empty response.""" + response = LlmResponse( + content=genai_types.Content(parts=[genai_types.Part(text="")]) + ) + auto_rater_score = evaluator.convert_auto_rater_response_to_score(response) + assert auto_rater_score.score is None + assert auto_rater_score.rubric_scores == [] + + +def test_convert_auto_rater_response_to_score_with_malformed_response( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests convert_auto_rater_response_to_score with a malformed response.""" + response = LlmResponse( + content=genai_types.Content( + parts=[genai_types.Part(text="This is not a valid format.")] + ) + ) + auto_rater_score = evaluator.convert_auto_rater_response_to_score(response) + assert auto_rater_score.score is None + assert auto_rater_score.rubric_scores == [] + + +def test_convert_auto_rater_response_to_score_with_mixed_verdicts( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests convert_auto_rater_response_to_score with mixed verdicts.""" + response_text = """ + Property: Is the response good? + Rationale: It was good. + Verdict: yes + Property: Is the response bad? + Rationale: It was bad. + Verdict: no + """ + response = LlmResponse( + content=genai_types.Content(parts=[genai_types.Part(text=response_text)]) + ) + auto_rater_score = evaluator.convert_auto_rater_response_to_score(response) + assert auto_rater_score.score == 0.5 + assert len(auto_rater_score.rubric_scores) == 2 + assert auto_rater_score.rubric_scores[0].score == 1.0 + assert auto_rater_score.rubric_scores[1].score == 0.0 + + +def test_convert_auto_rater_response_to_score_with_invalid_verdict( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests convert_auto_rater_response_to_score with an invalid verdict.""" + response_text = """ + Property: Is the response good? + Rationale: It was good. + Verdict: yes + Property: Is the response bad? + Rationale: I cannot tell. + Verdict: invalid + """ + response = LlmResponse( + content=genai_types.Content(parts=[genai_types.Part(text=response_text)]) + ) + auto_rater_score = evaluator.convert_auto_rater_response_to_score(response) + assert auto_rater_score.score == 1.0 + assert len(auto_rater_score.rubric_scores) == 2 + assert auto_rater_score.rubric_scores[0].score == 1.0 + assert auto_rater_score.rubric_scores[1].score is None + + +def test_convert_auto_rater_response_to_score_with_unknown_property( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests convert_auto_rater_response_to_score with an unknown property.""" + response_text = """ + Property: Is the response amazing? + Rationale: It was amazing. 
+ Verdict: yes + """ + response = LlmResponse( + content=genai_types.Content(parts=[genai_types.Part(text=response_text)]) + ) + auto_rater_score = evaluator.convert_auto_rater_response_to_score(response) + assert auto_rater_score.score is None + assert len(auto_rater_score.rubric_scores) == 0 + + +def test_aggregate_per_invocation_samples_with_no_rubric_scores( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregation when samples have no rubric scores.""" + samples = [ + _create_per_invocation_result([]), + _create_per_invocation_result([]), + ] + result = evaluator.aggregate_per_invocation_samples(samples) + assert result.score is None + assert result.rubric_scores == [] + + +def test_aggregate_per_invocation_samples_with_majority_positive( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregation with a majority of positive scores.""" + samples = [ + _create_per_invocation_result([RubricScore(rubric_id="1", score=1.0)]), + _create_per_invocation_result([RubricScore(rubric_id="1", score=1.0)]), + _create_per_invocation_result([RubricScore(rubric_id="1", score=0.0)]), + ] + result = evaluator.aggregate_per_invocation_samples(samples) + assert result.score == 1.0 + assert len(result.rubric_scores) == 1 + assert result.rubric_scores[0].rubric_id == "1" + assert result.rubric_scores[0].score == 1.0 + + +def test_aggregate_per_invocation_samples_with_majority_negative( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregation with a majority of negative scores.""" + samples = [ + _create_per_invocation_result([RubricScore(rubric_id="1", score=1.0)]), + _create_per_invocation_result([RubricScore(rubric_id="1", score=0.0)]), + _create_per_invocation_result([RubricScore(rubric_id="1", score=0.0)]), + ] + result = evaluator.aggregate_per_invocation_samples(samples) + assert result.score == 0.0 + assert len(result.rubric_scores) == 1 + assert result.rubric_scores[0].rubric_id == "1" + assert result.rubric_scores[0].score == 0.0 + + +def test_aggregate_per_invocation_samples_with_tie_verdicts( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregation with a tie, where negative should win.""" + samples = [ + _create_per_invocation_result([RubricScore(rubric_id="1", score=1.0)]), + _create_per_invocation_result([RubricScore(rubric_id="1", score=0.0)]), + ] + result = evaluator.aggregate_per_invocation_samples(samples) + assert result.score == 0.0 + assert len(result.rubric_scores) == 1 + assert result.rubric_scores[0].rubric_id == "1" + assert result.rubric_scores[0].score == 0.0 + + +def test_aggregate_per_invocation_samples_with_all_none_scores( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregation when all samples have a score of None.""" + samples = [ + _create_per_invocation_result( + [RubricScore(rubric_id="1", score=None, rationale="r1")] + ), + _create_per_invocation_result( + [RubricScore(rubric_id="1", score=None, rationale="r2")] + ), + ] + result = evaluator.aggregate_per_invocation_samples(samples) + assert result.score is None + assert len(result.rubric_scores) == 1 + assert result.rubric_scores[0].rubric_id == "1" + assert result.rubric_scores[0].score is None + assert result.rubric_scores[0].rationale == "r1" + + +def test_aggregate_per_invocation_samples_with_multiple_rubrics( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregation with multiple rubrics.""" + samples = [ + _create_per_invocation_result([ + 
RubricScore(rubric_id="1", score=1.0), + RubricScore(rubric_id="2", score=0.0), + ]), + _create_per_invocation_result([ + RubricScore(rubric_id="1", score=1.0), + RubricScore(rubric_id="2", score=0.0), + ]), + _create_per_invocation_result([ + RubricScore(rubric_id="1", score=0.0), + RubricScore(rubric_id="2", score=1.0), + ]), + ] + result = evaluator.aggregate_per_invocation_samples(samples) + assert result.score == 0.5 + assert len(result.rubric_scores) == 2 + rubric1_score = next( + (s for s in result.rubric_scores if s.rubric_id == "1"), None + ) + rubric2_score = next( + (s for s in result.rubric_scores if s.rubric_id == "2"), None + ) + assert rubric1_score is not None + assert rubric1_score.score == 1.0 + assert rubric2_score is not None + assert rubric2_score.score == 0.0 + + +def test_aggregate_invocation_results_with_empty_list( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregate_invocation_results with an empty list.""" + result = evaluator.aggregate_invocation_results([]) + assert isinstance(result, EvaluationResult) + assert result.overall_score is None + assert result.overall_rubric_scores == [] + assert result.per_invocation_results == [] + + +def test_aggregate_invocation_results_with_no_rubric_scores( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregate_invocation_results with samples that have no rubric scores.""" + invocations = [ + _create_per_invocation_result([]), + _create_per_invocation_result([]), + ] + result = evaluator.aggregate_invocation_results(invocations) + assert result.overall_score is None + assert result.overall_rubric_scores == [] + assert result.per_invocation_results == invocations + + +def test_aggregate_invocation_results_with_single_invocation( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregate_invocation_results with a single invocation result.""" + invocations = [ + _create_per_invocation_result([ + RubricScore(rubric_id="1", score=1.0), + RubricScore(rubric_id="2", score=0.0), + ]) + ] + result = evaluator.aggregate_invocation_results(invocations) + assert result.overall_score == 0.5 + assert len(result.overall_rubric_scores) == 2 + rubric1_score = next( + s for s in result.overall_rubric_scores if s.rubric_id == "1" + ) + rubric2_score = next( + s for s in result.overall_rubric_scores if s.rubric_id == "2" + ) + assert rubric1_score.score == 1.0 + assert rubric2_score.score == 0.0 + + +def test_aggregate_invocation_results_with_multiple_invocations_single_rubric( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregate_invocation_results with multiple invocations for a single rubric.""" + invocations = [ + _create_per_invocation_result([RubricScore(rubric_id="1", score=1.0)]), + _create_per_invocation_result([RubricScore(rubric_id="1", score=0.0)]), + _create_per_invocation_result([RubricScore(rubric_id="1", score=1.0)]), + ] + result = evaluator.aggregate_invocation_results(invocations) + assert result.overall_score == pytest.approx(2 / 3) + assert len(result.overall_rubric_scores) == 1 + assert result.overall_rubric_scores[0].rubric_id == "1" + assert result.overall_rubric_scores[0].score == pytest.approx(2 / 3) + + +def test_aggregate_invocation_results_with_multiple_invocations_and_rubrics( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregate_invocation_results with multiple invocations and rubrics.""" + invocations = [ + _create_per_invocation_result([ + RubricScore(rubric_id="1", score=1.0), + 
RubricScore(rubric_id="2", score=0.0), + ]), + _create_per_invocation_result([ + RubricScore(rubric_id="1", score=0.0), + RubricScore(rubric_id="2", score=1.0), + ]), + ] + result = evaluator.aggregate_invocation_results(invocations) + assert result.overall_score == 0.5 + assert len(result.overall_rubric_scores) == 2 + rubric1_score = next( + s for s in result.overall_rubric_scores if s.rubric_id == "1" + ) + rubric2_score = next( + s for s in result.overall_rubric_scores if s.rubric_id == "2" + ) + assert rubric1_score.score == 0.5 + assert rubric2_score.score == 0.5 + + +def test_aggregate_invocation_results_with_none_scores( + evaluator: RubricBasedFinalResponseQualityV1Evaluator, +): + """Tests aggregate_invocation_results with some None scores.""" + invocations = [ + _create_per_invocation_result([ + RubricScore(rubric_id="1", score=1.0), + RubricScore(rubric_id="2", score=None), + ]), + _create_per_invocation_result([ + RubricScore(rubric_id="1", score=0.0), + RubricScore(rubric_id="2", score=1.0), + ]), + ] + result = evaluator.aggregate_invocation_results(invocations) + assert result.overall_score == pytest.approx(2 / 3) + assert len(result.overall_rubric_scores) == 2 + rubric1_score = next( + s for s in result.overall_rubric_scores if s.rubric_id == "1" + ) + rubric2_score = next( + s for s in result.overall_rubric_scores if s.rubric_id == "2" + ) + assert rubric1_score.score == 0.5 + assert rubric2_score.score == 1.0