diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
index 7ccc8252..6914125d 100644
--- a/src/google/adk/cli/cli_eval.py
+++ b/src/google/adk/cli/cli_eval.py
@@ -37,6 +37,8 @@ from ..evaluation.base_eval_service import InferenceRequest
 from ..evaluation.base_eval_service import InferenceResult
 from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.eval_case import EvalCase
+from ..evaluation.eval_config import BaseCriterion
+from ..evaluation.eval_config import EvalConfig
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
@@ -64,6 +66,10 @@ DEFAULT_CRITERIA = {
     RESPONSE_MATCH_SCORE_KEY: 0.8,
 }
 
+_DEFAULT_EVAL_CONFIG = EvalConfig(
+    criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
+)
+
 
 def _import_from_path(module_name, file_path):
   spec = importlib.util.spec_from_file_location(module_name, file_path)
@@ -81,27 +87,48 @@ def _get_agent_module(agent_module_file_path: str):
 
 def get_evaluation_criteria_or_default(
     eval_config_file_path: str,
-) -> dict[str, float]:
-  """Returns evaluation criteria from the config file, if present.
+) -> EvalConfig:
+  """Returns EvalConfig read from the config file, if present.
 
   Otherwise a default one is returned.
   """
   if eval_config_file_path:
     with open(eval_config_file_path, "r", encoding="utf-8") as f:
-      config_data = json.load(f)
+      content = f.read()
+      return EvalConfig.model_validate_json(content)
 
-    if "criteria" in config_data and isinstance(config_data["criteria"], dict):
-      evaluation_criteria = config_data["criteria"]
-    else:
-      raise ValueError(
-          f"Invalid format for test_config.json at {eval_config_file_path}."
-          " Expected a 'criteria' dictionary."
-      )
-  else:
-    logger.info("No config file supplied. Using default criteria.")
-    evaluation_criteria = DEFAULT_CRITERIA
+  logger.info("No config file supplied. Using default criteria.")
+  return _DEFAULT_EVAL_CONFIG
 
-  return evaluation_criteria
+
+def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
+  """Returns a list of EvalMetrics mapped from the EvalConfig."""
+  eval_metric_list = []
+  if eval_config.criteria:
+    for metric_name, criterion in eval_config.criteria.items():
+      if isinstance(criterion, float):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion,
+                criterion=BaseCriterion(threshold=criterion),
+            )
+        )
+      elif isinstance(criterion, BaseCriterion):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion.threshold,
+                criterion=criterion,
+            )
+        )
+      else:
+        raise ValueError(
+            f"Unexpected criterion type. {type(criterion).__name__} not"
+            " supported."
+        )
+
+  return eval_metric_list
 
 
 def get_root_agent(agent_module_file_path: str) -> Agent:
diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index c45fdd37..019b3284 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -382,24 +382,16 @@ def cli_eval(
     from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
     from .cli_eval import _collect_eval_results
     from .cli_eval import _collect_inferences
+    from .cli_eval import get_eval_metrics_from_config
     from .cli_eval import get_evaluation_criteria_or_default
     from .cli_eval import get_root_agent
     from .cli_eval import parse_and_get_evals_to_run
   except ModuleNotFoundError as mnf:
     raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
 
-  evaluation_criteria = get_evaluation_criteria_or_default(config_file_path)
-  eval_metrics = []
-  for metric_name, threshold in evaluation_criteria.items():
-    eval_metrics.append(
-        EvalMetric(
-            metric_name=metric_name,
-            threshold=threshold,
-            judge_model_options=JudgeModelOptions(),
-        )
-    )
-
-  print(f"Using evaluation criteria: {evaluation_criteria}")
+  eval_config = get_evaluation_criteria_or_default(config_file_path)
+  print(f"Using evaluation criteria: {eval_config}")
+  eval_metrics = get_eval_metrics_from_config(eval_config)
 
   root_agent = get_root_agent(agent_module_file_path)
   app_name = os.path.basename(agent_module_file_path)
@@ -500,7 +492,9 @@ def cli_eval(
   except ModuleNotFoundError as mnf:
     raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
 
-  print("*********************************************************************")
+  click.echo(
+      "*********************************************************************"
+  )
 
   eval_run_summary = {}
   for eval_result in eval_results:
@@ -513,9 +507,9 @@
       eval_run_summary[eval_result.eval_set_id][0] += 1
     else:
       eval_run_summary[eval_result.eval_set_id][1] += 1
-  print("Eval Run Summary")
+  click.echo("Eval Run Summary")
   for eval_set_id, pass_fail_count in eval_run_summary.items():
-    print(
+    click.echo(
         f"{eval_set_id}:\n  Tests passed: {pass_fail_count[0]}\n  Tests"
         f" failed: {pass_fail_count[1]}"
     )
@@ -523,10 +517,17 @@
   if print_detailed_results:
     for eval_result in eval_results:
       eval_result: EvalCaseResult
-      print(
+      click.echo(
           "*********************************************************************"
       )
-      print(eval_result.model_dump_json(indent=2))
+      click.echo(
+          eval_result.model_dump_json(
+              indent=2,
+              exclude_unset=True,
+              exclude_defaults=True,
+              exclude_none=True,
+          )
+      )
 
 
 def adk_services_options():
@@ -1010,7 +1011,8 @@ def cli_deploy_cloud_run(
 
     adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
 
-    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent -- --no-allow-unauthenticated --min-instances=2
+    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
+    -- --no-allow-unauthenticated --min-instances=2
   """
   if verbosity:
     click.secho(
@@ -1222,7 +1224,8 @@ def cli_deploy_agent_engine(
   Example:
 
     adk deploy agent_engine --project=[project] --region=[region]
-    --staging_bucket=[staging_bucket] --display_name=[app_name] path/to/my_agent
+    --staging_bucket=[staging_bucket] --display_name=[app_name]
+    path/to/my_agent
   """
   try:
     cli_deploy.to_agent_engine(
@@ -1367,7 +1370,8 @@ def cli_deploy_gke(
 
   Example:
 
-    adk deploy gke --project=[project] --region=[region] --cluster_name=[cluster_name] path/to/my_agent
+    adk deploy gke --project=[project] --region=[region]
+    --cluster_name=[cluster_name] path/to/my_agent
   """
   try:
     cli_deploy.to_gke(
diff --git a/src/google/adk/evaluation/common.py b/src/google/adk/evaluation/common.py
new file mode 100644
index 00000000..3f349d57
--- /dev/null
+++ b/src/google/adk/evaluation/common.py
@@ -0,0 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import pydantic
+from pydantic import alias_generators
+
+
+class EvalBaseModel(pydantic.BaseModel):
+  model_config = pydantic.ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+      extra='forbid',
+  )
diff --git a/src/google/adk/evaluation/eval_config.py b/src/google/adk/evaluation/eval_config.py
new file mode 100644
index 00000000..cc2de90e
--- /dev/null
+++ b/src/google/adk/evaluation/eval_config.py
@@ -0,0 +1,66 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Union
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from .eval_metrics import BaseCriterion
+from .eval_metrics import Threshold
+
+
+class EvalConfig(BaseModel):
+  """Configurations needed to run an Eval.
+
+  Allows users to specify metrics, their thresholds and other properties.
+  """
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  criteria: dict[str, Union[Threshold, BaseCriterion]] = Field(
+      default_factory=dict,
+      description="""A dictionary that maps each eval metric to its criterion.
+
+The key of the dictionary is the name of the eval metric and the value is the
+criterion to be used.
+
+In the sample below, `tool_trajectory_avg_score`, `response_match_score` and
+`final_response_match_v2` are the standard eval metric names, represented as
+keys in the dictionary. The values in the dictionary are the corresponding
+criteria. For the first two metrics, a simple threshold is used as the
+criterion; the third one uses `LlmAsAJudgeCriterion`.
+
+{
+  "criteria": {
+    "tool_trajectory_avg_score": 1.0,
+    "response_match_score": 0.5,
+    "final_response_match_v2": {
+      "threshold": 0.5,
+      "judge_model_options": {
+        "judge_model": "my favorite LLM",
+        "num_samples": 5
+      }
+    }
+  }
+}
+""",
+  )
diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index d73ce1e6..66f7299f 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -25,8 +25,16 @@ from pydantic import ConfigDict
 from pydantic import Field
 from typing_extensions import TypeAlias
 
+from .common import EvalBaseModel
 from .eval_case import Invocation
-from .evaluator import EvalStatus
+from .eval_rubrics import Rubric
+from .eval_rubrics import RubricScore
+
+
+class EvalStatus(Enum):
+  PASSED = 1
+  FAILED = 2
+  NOT_EVALUATED = 3
 
 
 class PrebuiltMetrics(Enum):
@@ -42,9 +50,10 @@ class PrebuiltMetrics(Enum):
 
 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
 
+Threshold: TypeAlias = float
 
-class JudgeModelOptions(BaseModel):
+class JudgeModelOptions(EvalBaseModel):
   """Options for an eval metric's judge model."""
 
   judge_model: str = Field(
@@ -55,27 +64,69 @@
   )
 
   judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
-      default=None,
+      default_factory=genai_types.GenerateContentConfig,
       description="The configuration for the judge model.",
   )
 
-  num_samples: Optional[int] = Field(
-      default=None,
+  num_samples: int = Field(
+      default=5,
       description=(
           "The number of times to sample the model for each invocation"
-          " evaluation."
+          " evaluation. Given that models tend to have a certain degree of"
+          " unreliability, we repeatedly sample them with the same data. These"
+          " repeated invocations are then aggregated using some strategy. From"
+          " experimentation, we have found 5 to be a good default."
       ),
   )
 
 
-class EvalMetric(BaseModel):
-  """A metric used to evaluate a particular aspect of an eval case."""
+class BaseCriterion(BaseModel):
+  """Base criterion to use for an Eval Metric."""
 
   model_config = ConfigDict(
       alias_generator=alias_generators.to_camel,
       populate_by_name=True,
+      extra="allow",
   )
 
+  threshold: Threshold = Field(
+      description="The threshold to be used by the metric.",
+  )
+
+
+class LlmAsAJudgeCriterion(BaseCriterion):
+  """Criterion when using an LLM-As-A-Judge metric."""
+
+  judge_model_options: JudgeModelOptions = Field(
+      default_factory=JudgeModelOptions,
+      description="Options for the judge model.",
+  )
+
+
+class RubricsBasedCriterion(BaseCriterion):
+  """Criterion when using a rubric based metric."""
+
+  judge_model_options: JudgeModelOptions = Field(
+      default_factory=JudgeModelOptions,
+      description="Options for the judge model.",
+  )
+
+  rubrics: list[Rubric] = Field(
+      default_factory=list,
+      description=(
+          "Rubrics to be used by the metric. Not all metrics rely on rubrics,"
+          " but metrics like `rubric_based_final_response_quality_v1` do."
+          " Metrics that don't use rubrics will simply ignore this field if it"
+          " is specified. Metrics that do use rubrics will raise an exception"
+          " if they are not specified."
+      ),
+  )
+
+
+class EvalMetric(EvalBaseModel):
+  """A metric used to evaluate a particular aspect of an eval case."""
+
   metric_name: str = Field(
       description="The name of the metric.",
   )
@@ -88,19 +139,33 @@
   )
 
   judge_model_options: Optional[JudgeModelOptions] = Field(
+      deprecated=True,
       default=None,
-      description="Options for the judge model.",
+      description=(
+          "[DEPRECATED] This field is deprecated in favor of `criterion`."
+          " Depending on the metric, you may want to use one of the"
+          " sub-classes of `BaseCriterion`."
+      ),
+  )
+
+  criterion: Optional[BaseCriterion] = Field(
+      default=None, description="""Evaluation criterion used by the metric."""
+  )
+
+
+class EvalMetricResultDetails(EvalBaseModel):
+  rubric_scores: Optional[list[RubricScore]] = Field(
+      default=None,
+      description=(
+          "The scores obtained after applying the rubrics to the Agent's"
+          " response."
+      ),
   )
 
 
 class EvalMetricResult(EvalMetric):
   """The actual computed score/value of a particular EvalMetric."""
 
-  model_config = ConfigDict(
-      alias_generator=alias_generators.to_camel,
-      populate_by_name=True,
-  )
-
   score: Optional[float] = Field(
       default=None,
       description=(
@@ -108,17 +173,17 @@ class EvalMetricResult(EvalMetric):
           " might not have happened."
      ),
   )
+
   eval_status: EvalStatus = Field(description="The status of this evaluation.")
 
-
-class EvalMetricResultPerInvocation(BaseModel):
-  """Eval metric results per invocation."""
-
-  model_config = ConfigDict(
-      alias_generator=alias_generators.to_camel,
-      populate_by_name=True,
+  details: EvalMetricResultDetails = Field(
+      default_factory=EvalMetricResultDetails,
+      description="""Granular details of the metric result, e.g. rubric scores.""",
   )
 
+
+class EvalMetricResultPerInvocation(EvalBaseModel):
+  """Eval metric results per invocation."""
+
   actual_invocation: Invocation = Field(
       description=(
           "The actual invocation, usually obtained by inferencing the agent."
       )
   )
@@ -137,7 +202,7 @@
   )
 
 
-class Interval(BaseModel):
+class Interval(EvalBaseModel):
   """Represents a range of numeric values, e.g. [0 ,1] or (2,3) or [-1, 6)."""
 
   min_value: float = Field(description="The smaller end of the interval.")
@@ -161,7 +226,7 @@
   )
 
 
-class MetricValueInfo(BaseModel):
+class MetricValueInfo(EvalBaseModel):
   """Information about the type of metric value."""
 
   interval: Optional[Interval] = Field(
@@ -170,14 +235,9 @@
   )
 
 
-class MetricInfo(BaseModel):
+class MetricInfo(EvalBaseModel):
   """Information about the metric that are used for Evals."""
 
-  model_config = ConfigDict(
-      alias_generator=alias_generators.to_camel,
-      populate_by_name=True,
-  )
-
   metric_name: str = Field(description="The name of the metric.")
 
   description: str = Field(
diff --git a/src/google/adk/evaluation/eval_rubrics.py b/src/google/adk/evaluation/eval_rubrics.py
new file mode 100644
index 00000000..8dd2f6ca
--- /dev/null
+++ b/src/google/adk/evaluation/eval_rubrics.py
@@ -0,0 +1,82 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import Field
+
+from .common import EvalBaseModel
+
+
+class RubricContent(EvalBaseModel):
+  """The content of a rubric."""
+
+  text_property: Optional[str] = Field(
+      description=(
+          "The property being evaluated. Example: \"The agent's response is"
+          ' grammatically correct." '
+      )
+  )
+
+
+class Rubric(EvalBaseModel):
+  """This class represents a single Rubric."""
+
+  rubric_id: str = Field(
+      description="Unique identifier for the rubric.",
+  )
+
+  rubric_content: RubricContent = Field(
+      description="The actual testable criterion for the rubric."
+  )
+
+  description: Optional[str] = Field(
+      default=None,
+      description=(
+          "A description of the rubric that provides details on how the"
+          " results of the rubric assessment should be interpreted."
+      ),
+  )
+
+  type: Optional[str] = Field(
+      default=None,
+      description="""Optional. A type designator for the rubric, which can
+      inform how it's evaluated or interpreted by systems or users.
+
+      It's recommended to use consistent, well-defined, upper snake_case
+      strings.
+
+      Examples: "TOOL_USE_QUALITY", "FINAL_RESPONSE_QUALITY",
+      "INSTRUCTION_ADHERENCE".""",
+  )
+
+
+class RubricScore(EvalBaseModel):
+  """The score obtained after applying the rubric to the Agent's response."""
+
+  rubric_id: str = Field(description="The id of the rubric that was assessed.")
+
+  rationale: Optional[str] = Field(
+      default=None, description="Reasoning/rationale for the score."
+  )
+
+  score: Optional[float] = Field(
+      default=None,
+      description=(
+          "Score obtained after assessing the rubric. Optional, as assessment"
+          " might not have happened."
+      ),
+  )
diff --git a/src/google/adk/evaluation/evaluator.py b/src/google/adk/evaluation/evaluator.py
index bc19313d..07ee9584 100644
--- a/src/google/adk/evaluation/evaluator.py
+++ b/src/google/adk/evaluation/evaluator.py
@@ -11,20 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 from abc import ABC
-from enum import Enum
+from typing import ClassVar
 from typing import Optional
 
 from pydantic import BaseModel
+from typing_extensions import TypeAlias
 
 from .eval_case import Invocation
+from .eval_metrics import BaseCriterion
+from .eval_metrics import EvalStatus
 
-
-class EvalStatus(Enum):
-  PASSED = 1
-  FAILED = 2
-  NOT_EVALUATED = 3
+# Redefining the type here for backward compatibility.
+EvalStatus: TypeAlias = EvalStatus
 
 
 class PerInvocationResult(BaseModel):
@@ -49,6 +50,8 @@ class EvaluationResult(BaseModel):
 class Evaluator(ABC):
   """A merics evaluator interface."""
 
+  criterion_type: ClassVar[type[BaseCriterion]] = BaseCriterion
+
   def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
diff --git a/src/google/adk/evaluation/final_response_match_v2.py b/src/google/adk/evaluation/final_response_match_v2.py
index 177e719a..827f397b 100644
--- a/src/google/adk/evaluation/final_response_match_v2.py
+++ b/src/google/adk/evaluation/final_response_match_v2.py
@@ -16,6 +16,7 @@ from __future__ import annotations
 
 import logging
 import re
+from typing import ClassVar
 from typing import Optional
 
 from typing_extensions import override
@@ -24,11 +25,12 @@ from ..models.llm_response import LlmResponse
 from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
+from .eval_metrics import EvalStatus
 from .eval_metrics import Interval
+from .eval_metrics import LlmAsAJudgeCriterion
 from .eval_metrics import MetricInfo
 from .eval_metrics import MetricValueInfo
 from .eval_metrics import PrebuiltMetrics
-from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import PerInvocationResult
 from .llm_as_judge import LlmAsJudge
@@ -79,8 +81,6 @@ The answer should be a json alone which follows the json structure below:
 Answer with assertiveness:
 """
 
-_DEFAULT_NUM_SAMPLES = 5
-
 
 def _parse_critique(response: str) -> Label:
   """Parses the judge model critique and extracts the final label.
@@ -140,15 +140,14 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
   score indicate better final response performance of the agent.
   """
 
+  criterion_type: ClassVar[type[LlmAsAJudgeCriterion]] = LlmAsAJudgeCriterion
+
   def __init__(
       self,
       eval_metric: EvalMetric,
   ):
-    super().__init__(eval_metric)
+    super().__init__(eval_metric, FinalResponseMatchV2Evaluator.criterion_type)
     self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT
-    assert self._eval_metric.judge_model_options is not None
-    if self._eval_metric.judge_model_options.num_samples is None:
-      self._eval_metric.judge_model_options.num_samples = _DEFAULT_NUM_SAMPLES
 
   @staticmethod
   def get_metric_info() -> MetricInfo:
@@ -241,7 +240,7 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
     return EvaluationResult(
         overall_score=overall_score,
         overall_eval_status=get_eval_status(
-            overall_score, self._eval_metric.threshold
+            overall_score, self._criterion.threshold
         ),
         per_invocation_results=per_invocation_results,
     )
diff --git a/src/google/adk/evaluation/llm_as_judge.py b/src/google/adk/evaluation/llm_as_judge.py
index b17ee82d..cf86ffbb 100644
--- a/src/google/adk/evaluation/llm_as_judge.py
+++ b/src/google/adk/evaluation/llm_as_judge.py
@@ -18,6 +18,7 @@ from abc import abstractmethod
 from typing import Optional
 
 from google.genai import types as genai_types
+from pydantic import ValidationError
 from typing_extensions import override
 
 from ..models.base_llm import BaseLlm
@@ -26,6 +27,7 @@ from ..models.llm_response import LlmResponse
 from ..models.registry import LLMRegistry
 from ..utils.context_utils import Aclosing
 from .eval_case import Invocation
+from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
@@ -49,17 +51,26 @@ class LlmAsJudge(Evaluator):
   """
 
   def __init__(
-      self,
-      eval_metric: EvalMetric,
+      self, eval_metric: EvalMetric, criterion_type: type[BaseCriterion]
   ):
     self._eval_metric = eval_metric
-    if not eval_metric.judge_model_options:
-      raise ValueError("Judge model options is required for LlmAsJudge.")
-    self._judge_model_options = eval_metric.judge_model_options
-    if self._judge_model_options.judge_model_config is None:
-      self._judge_model_options.judge_model_config = (
-          genai_types.GenerateContentConfig()
+
+    expected_criterion_type_error = ValueError(
+        f"`{eval_metric.metric_name}` metric expects a criterion of type"
+        f" `{criterion_type}`."
+    )
+
+    try:
+      if self._eval_metric.criterion is None:
+        raise expected_criterion_type_error
+
+      self._criterion = criterion_type.model_validate(
+          self._eval_metric.criterion.model_dump()
       )
+    except ValidationError as e:
+      raise expected_criterion_type_error from e
+
+    self._judge_model_options = self._criterion.judge_model_options
     self._judge_model = self._setup_auto_rater()
 
   @abstractmethod
@@ -122,7 +133,7 @@ class LlmAsJudge(Evaluator):
                   expected_invocation=expected,
                   score=score,
                   eval_status=get_eval_status(
-                      score, self._eval_metric.threshold
+                      score, self._criterion.threshold
                   ),
               )
           )
diff --git a/src/google/adk/evaluation/metric_evaluator_registry.py b/src/google/adk/evaluation/metric_evaluator_registry.py
index e5fd33f4..e2bcd5f8 100644
--- a/src/google/adk/evaluation/metric_evaluator_registry.py
+++ b/src/google/adk/evaluation/metric_evaluator_registry.py
@@ -20,7 +20,6 @@ from ..errors.not_found_error import NotFoundError
 from ..utils.feature_decorator import experimental
 from .eval_metrics import EvalMetric
 from .eval_metrics import MetricInfo
-from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
 from .final_response_match_v2 import FinalResponseMatchV2Evaluator
diff --git a/tests/unittests/cli/test_cli_eval.py b/tests/unittests/cli/test_cli_eval.py
new file mode 100644
index 00000000..2b284ded
--- /dev/null
+++ b/tests/unittests/cli/test_cli_eval.py
@@ -0,0 +1,96 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from unittest import mock
+
+from google.adk.cli.cli_eval import _DEFAULT_EVAL_CONFIG
+from google.adk.cli.cli_eval import get_eval_metrics_from_config
+from google.adk.cli.cli_eval import get_evaluation_criteria_or_default
+from google.adk.evaluation.eval_config import EvalConfig
+from google.adk.evaluation.eval_rubrics import Rubric
+from google.adk.evaluation.eval_rubrics import RubricContent
+
+
+def test_get_evaluation_criteria_or_default_returns_default():
+  assert get_evaluation_criteria_or_default("") == _DEFAULT_EVAL_CONFIG
+
+
+def test_get_evaluation_criteria_or_default_reads_from_file():
+  eval_config = EvalConfig(
+      criteria={"tool_trajectory_avg_score": 0.5, "response_match_score": 0.5}
+  )
+  mock_open = mock.mock_open(read_data=eval_config.model_dump_json())
+  with mock.patch("builtins.open", mock_open):
+    assert get_evaluation_criteria_or_default("dummy_path") == eval_config
+
+
+def test_get_eval_metrics_from_config():
+  rubric_1 = Rubric(
+      rubric_id="test-rubric",
+      rubric_content=RubricContent(text_property="test"),
+  )
+  eval_config = EvalConfig(
+      criteria={
+          "tool_trajectory_avg_score": 1.0,
+          "response_match_score": 0.8,
+          "final_response_match_v2": {
+              "threshold": 0.5,
+              "judge_model_options": {
+                  "judge_model": "gemini-pro",
+                  "num_samples": 1,
+              },
+          },
+          "rubric_based_final_response_quality_v1": {
+              "threshold": 0.9,
+              "judge_model_options": {
+                  "judge_model": "gemini-ultra",
+                  "num_samples": 1,
+              },
+              "rubrics": [rubric_1],
+          },
+      }
+  )
+  eval_metrics = get_eval_metrics_from_config(eval_config)
+
+  assert len(eval_metrics) == 4
+  assert eval_metrics[0].metric_name == "tool_trajectory_avg_score"
+  assert eval_metrics[0].threshold == 1.0
+  assert eval_metrics[0].criterion.threshold == 1.0
+  assert eval_metrics[1].metric_name == "response_match_score"
+  assert eval_metrics[1].threshold == 0.8
+  assert eval_metrics[1].criterion.threshold == 0.8
+  assert eval_metrics[2].metric_name == "final_response_match_v2"
+  assert eval_metrics[2].threshold == 0.5
+  assert eval_metrics[2].criterion.threshold == 0.5
+  assert (
+      eval_metrics[2].criterion.judge_model_options["judge_model"]
+      == "gemini-pro"
+  )
+  assert eval_metrics[3].metric_name == "rubric_based_final_response_quality_v1"
+  assert eval_metrics[3].threshold == 0.9
+  assert eval_metrics[3].criterion.threshold == 0.9
+  assert (
+      eval_metrics[3].criterion.judge_model_options["judge_model"]
+      == "gemini-ultra"
+  )
+  assert len(eval_metrics[3].criterion.rubrics) == 1
+  assert eval_metrics[3].criterion.rubrics[0] == rubric_1
+
+
+def test_get_eval_metrics_from_config_empty_criteria():
+  eval_config = EvalConfig(criteria={})
+  eval_metrics = get_eval_metrics_from_config(eval_config)
+  assert not eval_metrics
diff --git a/tests/unittests/cli/test_fast_api.py b/tests/unittests/cli/test_fast_api.py
index d0d84cac..d1e8dcab 100755
--- a/tests/unittests/cli/test_fast_api.py
+++ b/tests/unittests/cli/test_fast_api.py
@@ -840,6 +840,7 @@ def test_run_eval(test_app, create_test_eval_set):
             "threshold": 0.5,
             "score": 1.0,
             "evalStatus": 1,
+            "details": {},
         }],
     }
     for k, v in expected_eval_case_result.items():
diff --git a/tests/unittests/evaluation/test_final_response_match_v2.py b/tests/unittests/evaluation/test_final_response_match_v2.py
index 911c5e22..382b7a7d 100644
--- a/tests/unittests/evaluation/test_final_response_match_v2.py
+++ b/tests/unittests/evaluation/test_final_response_match_v2.py
@@ -15,6 +15,7 @@ from __future__ import annotations
 
 from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_metrics import BaseCriterion
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import JudgeModelOptions
 from google.adk.evaluation.eval_metrics import PrebuiltMetrics
@@ -130,9 +131,8 @@ def _create_test_evaluator_gemini(
       EvalMetric(
           metric_name="final_response_match_v2",
           threshold=threshold,
-          judge_model_options=JudgeModelOptions(
-              judge_model="gemini-2.5-flash",
-              num_samples=3,
+          criterion=BaseCriterion(
+              threshold=0.5,
           ),
       ),
   )
diff --git a/tests/unittests/evaluation/test_llm_as_judge.py b/tests/unittests/evaluation/test_llm_as_judge.py
index 2a4ba13c..d03d88b2 100644
--- a/tests/unittests/evaluation/test_llm_as_judge.py
+++ b/tests/unittests/evaluation/test_llm_as_judge.py
@@ -20,6 +20,7 @@ from unittest.mock import MagicMock
 from google.adk.evaluation.eval_case import Invocation
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import JudgeModelOptions
+from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.evaluator import EvaluationResult
 from google.adk.evaluation.evaluator import PerInvocationResult
@@ -60,15 +61,19 @@ class MockLlmAsJudge(LlmAsJudge):
 @pytest.fixture
 def mock_llm_as_judge():
   return MockLlmAsJudge(
-      EvalMetric(
+      eval_metric=EvalMetric(
          metric_name="test_metric",
          threshold=0.5,
-          judge_model_options=JudgeModelOptions(
-              judge_model="gemini-2.5-flash",
-              judge_model_config=genai_types.GenerateContentConfig(),
-              num_samples=3,
+          criterion=LlmAsAJudgeCriterion(
+              threshold=0.5,
+              judge_model_options=JudgeModelOptions(
+                  judge_model="gemini-2.5-flash",
+                  judge_model_config=genai_types.GenerateContentConfig(),
+                  num_samples=3,
+              ),
          ),
      ),
+      criterion_type=LlmAsAJudgeCriterion,
  )
 
 
@@ -94,10 +99,11 @@ def test_get_eval_status():
   assert get_eval_status(score=None, threshold=0.8) == EvalStatus.NOT_EVALUATED
 
 
-def test_llm_as_judge_init_missing_judge_model_options():
+def test_llm_as_judge_init_missing_criterion():
   with pytest.raises(ValueError):
     MockLlmAsJudge(
         EvalMetric(metric_name="test_metric", threshold=0.8),
+        criterion_type=LlmAsAJudgeCriterion,
     )
 
 
@@ -107,10 +113,16 @@ def test_llm_as_judge_init_unregistered_model():
         EvalMetric(
             metric_name="test_metric",
             threshold=0.8,
-            judge_model_options=JudgeModelOptions(
-                judge_model="unregistered_model",
+            criterion=LlmAsAJudgeCriterion(
+                threshold=0.5,
+                judge_model_options=JudgeModelOptions(
+                    judge_model="unregistered_model",
+                    judge_model_config=genai_types.GenerateContentConfig(),
+                    num_samples=3,
+                ),
            ),
        ),
+        criterion_type=LlmAsAJudgeCriterion,
    )
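
A minimal usage sketch of the new config plumbing, outside the diff itself: it strings together the two helpers added to cli_eval.py the same way the `adk eval` command now does. The "test_config.json" path and the printed fields are illustrative assumptions, not part of the change.

# Illustrative sketch only; assumes a config file shaped like the example in eval_config.py.
from google.adk.cli.cli_eval import get_eval_metrics_from_config
from google.adk.cli.cli_eval import get_evaluation_criteria_or_default

# Reads an EvalConfig from the JSON file when a path is given,
# otherwise falls back to the built-in default criteria.
eval_config = get_evaluation_criteria_or_default("test_config.json")

# Maps each `criteria` entry (a bare float threshold or a BaseCriterion-shaped
# object) to an EvalMetric with its `criterion` field populated.
eval_metrics = get_eval_metrics_from_config(eval_config)
for metric in eval_metrics:
  print(metric.metric_name, metric.criterion.threshold)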