feat: Data model for rubric-based metrics and eval config

Details:
- We plan on introducing rubric-based metrics in subsequent changes. This change introduces the data model that allows agent developers to provide rubrics.

- We also introduce a data model for the config that the eval system has been using for quite some time. It was loosely and informally described as a dictionary of metric names and expected thresholds. In this change, we formalize it with a pydantic data model and extend it to allow developers to specify rubrics as part of their eval config; a minimal sketch follows below.
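
A minimal sketch (assuming the modules land as laid out in this change; the judge model name is illustrative) of how a developer could construct the formalized config, with plain float thresholds still working alongside richer criterion objects:

from google.adk.evaluation.eval_config import EvalConfig
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion

eval_config = EvalConfig(
    criteria={
        # Plain thresholds, as the old informal dictionary allowed.
        "tool_trajectory_avg_score": 1.0,
        "response_match_score": 0.8,
        # An LLM-as-a-judge metric takes a structured criterion instead.
        "final_response_match_v2": LlmAsAJudgeCriterion(
            threshold=0.5,
            judge_model_options=JudgeModelOptions(
                judge_model="gemini-pro",  # illustrative judge model name
                num_samples=5,
            ),
        ),
    }
)

The same structure can be written as JSON in the eval config file and is read back via EvalConfig.model_validate_json.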

What is a rubric-based metric?
A rubric-based metric assesses an agent's response (final or intermediate) against a rubric. This differs significantly from evaluation strategies that require a golden response to compare against; a sketch of the data model follows below.
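
A hedged sketch of the new data model in use (class names come from this change; the rubric id is made up): a Rubric states a single testable property, and a RubricsBasedCriterion bundles rubrics with a pass threshold, so no golden response is needed.

from google.adk.evaluation.eval_metrics import RubricsBasedCriterion
from google.adk.evaluation.eval_rubrics import Rubric
from google.adk.evaluation.eval_rubrics import RubricContent

# A rubric captures one testable property of the agent's response.
grammar_rubric = Rubric(
    rubric_id="grammar_check",  # made-up id for illustration
    rubric_content=RubricContent(
        text_property="The agent's response is grammatically correct."
    ),
    type="FINAL_RESPONSE_QUALITY",
)

# Rubric-based metrics receive their rubrics through the criterion.
criterion = RubricsBasedCriterion(threshold=0.9, rubrics=[grammar_rubric])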

PiperOrigin-RevId: 805488436
Ankur Sharma
2025-09-10 13:19:32 -07:00
committed by Copybara-Service
parent 37228beddd
commit e88e667770
14 changed files with 484 additions and 98 deletions
+41 -14
@@ -37,6 +37,8 @@ from ..evaluation.base_eval_service import InferenceRequest
from ..evaluation.base_eval_service import InferenceResult
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from ..evaluation.eval_case import EvalCase
from ..evaluation.eval_config import BaseCriterion
from ..evaluation.eval_config import EvalConfig
from ..evaluation.eval_metrics import EvalMetric
from ..evaluation.eval_metrics import EvalMetricResult
from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
@@ -64,6 +66,10 @@ DEFAULT_CRITERIA = {
RESPONSE_MATCH_SCORE_KEY: 0.8,
}
_DEFAULT_EVAL_CONFIG = EvalConfig(
criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
)
def _import_from_path(module_name, file_path):
spec = importlib.util.spec_from_file_location(module_name, file_path)
@@ -81,27 +87,48 @@ def _get_agent_module(agent_module_file_path: str):
def get_evaluation_criteria_or_default(
eval_config_file_path: str,
) -> dict[str, float]:
"""Returns evaluation criteria from the config file, if present.
) -> EvalConfig:
"""Returns EvalConfig read from the config file, if present.
Otherwise a default one is returned.
"""
if eval_config_file_path:
with open(eval_config_file_path, "r", encoding="utf-8") as f:
config_data = json.load(f)
content = f.read()
return EvalConfig.model_validate_json(content)
if "criteria" in config_data and isinstance(config_data["criteria"], dict):
evaluation_criteria = config_data["criteria"]
else:
raise ValueError(
f"Invalid format for test_config.json at {eval_config_file_path}."
" Expected a 'criteria' dictionary."
)
else:
logger.info("No config file supplied. Using default criteria.")
evaluation_criteria = DEFAULT_CRITERIA
logger.info("No config file supplied. Using default criteria.")
return _DEFAULT_EVAL_CONFIG
return evaluation_criteria
def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
"""Returns a list of EvalMetrics mapped from the EvalConfig."""
eval_metric_list = []
if eval_config.criteria:
for metric_name, criterion in eval_config.criteria.items():
if isinstance(criterion, float):
eval_metric_list.append(
EvalMetric(
metric_name=metric_name,
threshold=criterion,
criterion=BaseCriterion(threshold=criterion),
)
)
elif isinstance(criterion, BaseCriterion):
eval_metric_list.append(
EvalMetric(
metric_name=metric_name,
threshold=criterion.threshold,
criterion=criterion,
)
)
else:
raise ValueError(
f"Unexpected criterion type. {type(criterion).__name__} not"
" supported."
)
return eval_metric_list
def get_root_agent(agent_module_file_path: str) -> Agent:
+24 -20
@@ -382,24 +382,16 @@ def cli_eval(
from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
from .cli_eval import _collect_eval_results
from .cli_eval import _collect_inferences
from .cli_eval import get_eval_metrics_from_config
from .cli_eval import get_evaluation_criteria_or_default
from .cli_eval import get_root_agent
from .cli_eval import parse_and_get_evals_to_run
except ModuleNotFoundError as mnf:
raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
evaluation_criteria = get_evaluation_criteria_or_default(config_file_path)
eval_metrics = []
for metric_name, threshold in evaluation_criteria.items():
eval_metrics.append(
EvalMetric(
metric_name=metric_name,
threshold=threshold,
judge_model_options=JudgeModelOptions(),
)
)
print(f"Using evaluation criteria: {evaluation_criteria}")
eval_config = get_evaluation_criteria_or_default(config_file_path)
print(f"Using evaluation criteria: {eval_config}")
eval_metrics = get_eval_metrics_from_config(eval_config)
root_agent = get_root_agent(agent_module_file_path)
app_name = os.path.basename(agent_module_file_path)
@@ -500,7 +492,9 @@ def cli_eval(
except ModuleNotFoundError as mnf:
raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
print("*********************************************************************")
click.echo(
"*********************************************************************"
)
eval_run_summary = {}
for eval_result in eval_results:
@@ -513,9 +507,9 @@ def cli_eval(
eval_run_summary[eval_result.eval_set_id][0] += 1
else:
eval_run_summary[eval_result.eval_set_id][1] += 1
print("Eval Run Summary")
click.echo("Eval Run Summary")
for eval_set_id, pass_fail_count in eval_run_summary.items():
print(
click.echo(
f"{eval_set_id}:\n Tests passed: {pass_fail_count[0]}\n Tests"
f" failed: {pass_fail_count[1]}"
)
@@ -523,10 +517,17 @@ def cli_eval(
if print_detailed_results:
for eval_result in eval_results:
eval_result: EvalCaseResult
print(
click.echo(
"*********************************************************************"
)
print(eval_result.model_dump_json(indent=2))
click.echo(
eval_result.model_dump_json(
indent=2,
exclude_unset=True,
exclude_defaults=True,
exclude_none=True,
)
)
def adk_services_options():
@@ -1010,7 +1011,8 @@ def cli_deploy_cloud_run(
adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent -- --no-allow-unauthenticated --min-instances=2
adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
-- --no-allow-unauthenticated --min-instances=2
"""
if verbosity:
click.secho(
@@ -1222,7 +1224,8 @@ def cli_deploy_agent_engine(
Example:
adk deploy agent_engine --project=[project] --region=[region]
--staging_bucket=[staging_bucket] --display_name=[app_name] path/to/my_agent
--staging_bucket=[staging_bucket] --display_name=[app_name]
path/to/my_agent
"""
try:
cli_deploy.to_agent_engine(
@@ -1367,7 +1370,8 @@ def cli_deploy_gke(
Example:
adk deploy gke --project=[project] --region=[region] --cluster_name=[cluster_name] path/to/my_agent
adk deploy gke --project=[project] --region=[region]
--cluster_name=[cluster_name] path/to/my_agent
"""
try:
cli_deploy.to_gke(
+26
@@ -0,0 +1,26 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import pydantic
from pydantic import alias_generators
class EvalBaseModel(pydantic.BaseModel):
model_config = pydantic.ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
extra='forbid',
)
+66
@@ -0,0 +1,66 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from typing import Union
from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from .eval_metrics import BaseCriterion
from .eval_metrics import Threshold
class EvalConfig(BaseModel):
"""Configurations needed to run an Eval.
Allows users to specify metrics, their thresholds and other properties.
"""
model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
)
criteria: dict[str, Union[Threshold, BaseCriterion]] = Field(
default_factory=dict,
description="""A dictionary that maps criterion to be used for a metric.
The key of the dictionary is the name of the eval metric and the value is the
criterion to be used.
In the sample below, `tool_trajectory_avg_score`, `response_match_score` and
`final_response_match_v2` are the standard eval metric names, represented as
keys in the dictionary. The values in the dictionary are the corresponding
criteria. For the first two metrics, a simple threshold is used as the criterion;
the third uses `LlmAsAJudgeCriterion`.
  {
    "criteria": {
      "tool_trajectory_avg_score": 1.0,
      "response_match_score": 0.5,
      "final_response_match_v2": {
        "threshold": 0.5,
        "judge_model_options": {
          "judge_model": "my favorite LLM",
          "num_samples": 5
        }
      }
    }
  }
""",
)
+89 -29
@@ -25,8 +25,16 @@ from pydantic import ConfigDict
from pydantic import Field
from typing_extensions import TypeAlias
from .common import EvalBaseModel
from .eval_case import Invocation
from .evaluator import EvalStatus
from .eval_rubrics import Rubric
from .eval_rubrics import RubricScore
class EvalStatus(Enum):
PASSED = 1
FAILED = 2
NOT_EVALUATED = 3
class PrebuiltMetrics(Enum):
@@ -42,9 +50,10 @@ class PrebuiltMetrics(Enum):
MetricName: TypeAlias = Union[str, PrebuiltMetrics]
Threshold: TypeAlias = float
class JudgeModelOptions(BaseModel):
class JudgeModelOptions(EvalBaseModel):
"""Options for an eval metric's judge model."""
judge_model: str = Field(
@@ -55,27 +64,69 @@ class JudgeModelOptions(BaseModel):
)
judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
default=None,
default=genai_types.GenerateContentConfig,
description="The configuration for the judge model.",
)
num_samples: Optional[int] = Field(
default=None,
num_samples: int = Field(
default=5,
description=(
"The number of times to sample the model for each invocation"
" evaluation."
" evaluation. Given that models tend to have certain degree of"
" unreliability to them, we repeatedly sample them with the same"
" data. These repeated invocation are them aggregated using some"
" strategy. From experimentation, we have found 5 to be a good"
" default."
),
)
class EvalMetric(BaseModel):
"""A metric used to evaluate a particular aspect of an eval case."""
class BaseCriterion(BaseModel):
"""Base creterion to use for an Eval Metric."""
model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
extra="allow",
)
threshold: Threshold = Field(
description="The threshold to be used by the metric.",
)
class LlmAsAJudgeCriterion(BaseCriterion):
"""Criterion when using LLM-As-A-Judge metric."""
judge_model_options: JudgeModelOptions = Field(
default_factory=JudgeModelOptions,
description="Options for the judge model.",
)
class RubricsBasedCriterion(BaseCriterion):
"""Criterion when using a rubric based metric."""
judge_model_options: JudgeModelOptions = Field(
default_factory=JudgeModelOptions,
description="Options for the judge model.",
)
rubrics: list[Rubric] = Field(
default_factory=list,
description=(
"Rubrics to be used by Metric. Not all metrics rely on rubrics, but"
" metrics like `rubric_based_final_response_quality_v1` do. Metrics"
" that don't use Rubrics, will just ignore this field, if specified."
" Metrics that do use rubrics will raise an execption, if they are"
" not specified."
),
)
class EvalMetric(EvalBaseModel):
"""A metric used to evaluate a particular aspect of an eval case."""
metric_name: str = Field(
description="The name of the metric.",
)
@@ -88,19 +139,33 @@ class EvalMetric(BaseModel):
)
judge_model_options: Optional[JudgeModelOptions] = Field(
deprecated=True,
default=None,
description="Options for the judge model.",
description=(
"[DEPRECATED] This field is deprecated in favor of `criterion`."
" Depending on the metric you may want to one of the sub-classes of"
" BaseCriterion."
),
)
criterion: Optional[BaseCriterion] = Field(
default=None, description="""Evaluation criterion used by the metric."""
)
class EvalMetricResultDetails(EvalBaseModel):
rubric_scores: Optional[list[RubricScore]] = Field(
default=None,
description=(
"The scores obtained after applying the rubrics to the Agent's"
" response."
),
)
class EvalMetricResult(EvalMetric):
"""The actual computed score/value of a particular EvalMetric."""
model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
)
score: Optional[float] = Field(
default=None,
description=(
@@ -108,17 +173,17 @@ class EvalMetricResult(EvalMetric):
" might not have happened."
),
)
eval_status: EvalStatus = Field(description="The status of this evaluation.")
class EvalMetricResultPerInvocation(BaseModel):
"""Eval metric results per invocation."""
model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
details: EvalMetricResultDetails = Field(
default_factory=EvalMetricResultDetails, description=""""""
)
class EvalMetricResultPerInvocation(EvalBaseModel):
"""Eval metric results per invocation."""
actual_invocation: Invocation = Field(
description=(
"The actual invocation, usually obtained by inferencing the agent."
@@ -137,7 +202,7 @@ class EvalMetricResultPerInvocation(BaseModel):
)
class Interval(BaseModel):
class Interval(EvalBaseModel):
"""Represents a range of numeric values, e.g. [0 ,1] or (2,3) or [-1, 6)."""
min_value: float = Field(description="The smaller end of the interval.")
@@ -161,7 +226,7 @@ class Interval(BaseModel):
)
class MetricValueInfo(BaseModel):
class MetricValueInfo(EvalBaseModel):
"""Information about the type of metric value."""
interval: Optional[Interval] = Field(
@@ -170,14 +235,9 @@ class MetricValueInfo(BaseModel):
)
class MetricInfo(BaseModel):
class MetricInfo(EvalBaseModel):
"""Information about the metric that are used for Evals."""
model_config = ConfigDict(
alias_generator=alias_generators.to_camel,
populate_by_name=True,
)
metric_name: str = Field(description="The name of the metric.")
description: str = Field(
+82
@@ -0,0 +1,82 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from typing import Optional
from pydantic import Field
from .common import EvalBaseModel
class RubricContent(EvalBaseModel):
"""The content of a rubric."""
text_property: Optional[str] = Field(
description=(
"The property being evaluated. Example: \"The agent's response is"
' grammatically correct." '
)
)
class Rubric(EvalBaseModel):
"""This class represents a single Rubric."""
rubric_id: str = Field(
description="Unique identifier for the rubric.",
)
rubric_content: RubricContent = Field(
description="The actual testable criterion for the rubric."
)
description: Optional[str] = Field(
default=None,
description=(
"A description of the rubric that provide details on how the results"
" of the rubric assessment be interpreted."
),
)
type: Optional[str] = Field(
default=None,
description="""Optional. A type designator for the rubric, which can
inform how it's evaluated or interpreted by systems or users.
It's recommended to use consistent, well-defined, upper snake_case
strings.
Examples: "TOOL_USE_QUALITY", "FINAL_RESPONSE_QUALITY",
"INSTRUCTION_ADHERENCE".""",
)
class RubricScore(EvalBaseModel):
"""The score obtained after applying the rubric to the Agent's response."""
rubric_id: str = Field(description="The id of the rubric that was assessed.")
rationale: Optional[str] = Field(
default=None, description="Reasoning/rationale for the score."
)
score: Optional[float] = Field(
default=None,
description=(
"Score obtained after assessing the rubric. Optional, as assessment"
" might not have happened."
),
)
+9 -6
@@ -11,20 +11,21 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from abc import ABC
from enum import Enum
from typing import ClassVar
from typing import Optional
from pydantic import BaseModel
from typing_extensions import TypeAlias
from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalStatus
class EvalStatus(Enum):
PASSED = 1
FAILED = 2
NOT_EVALUATED = 3
# Redefining the type here for backward compatibility.
EvalStatus: TypeAlias = EvalStatus
class PerInvocationResult(BaseModel):
@@ -49,6 +50,8 @@ class EvaluationResult(BaseModel):
class Evaluator(ABC):
"""A merics evaluator interface."""
criterion_type: ClassVar[type[BaseCriterion]] = BaseCriterion
def evaluate_invocations(
self,
actual_invocations: list[Invocation],
@@ -16,6 +16,7 @@ from __future__ import annotations
import logging
import re
from typing import ClassVar
from typing import Optional
from typing_extensions import override
@@ -24,11 +25,12 @@ from ..models.llm_response import LlmResponse
from ..utils.feature_decorator import experimental
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import EvalStatus
from .eval_metrics import Interval
from .eval_metrics import LlmAsAJudgeCriterion
from .eval_metrics import MetricInfo
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import PerInvocationResult
from .llm_as_judge import LlmAsJudge
@@ -79,8 +81,6 @@ The answer should be a json alone which follows the json structure below:
Answer with assertiveness:
"""
_DEFAULT_NUM_SAMPLES = 5
def _parse_critique(response: str) -> Label:
"""Parses the judge model critique and extracts the final label.
@@ -140,15 +140,14 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
score indicate better final response performance of the agent.
"""
criterion_type: ClassVar[type[LlmAsAJudgeCriterion]] = LlmAsAJudgeCriterion
def __init__(
self,
eval_metric: EvalMetric,
):
super().__init__(eval_metric)
super().__init__(eval_metric, FinalResponseMatchV2Evaluator.criterion_type)
self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT
assert self._eval_metric.judge_model_options is not None
if self._eval_metric.judge_model_options.num_samples is None:
self._eval_metric.judge_model_options.num_samples = _DEFAULT_NUM_SAMPLES
@staticmethod
def get_metric_info() -> MetricInfo:
@@ -241,7 +240,7 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
return EvaluationResult(
overall_score=overall_score,
overall_eval_status=get_eval_status(
overall_score, self._eval_metric.threshold
overall_score, self._criterion.threshold
),
per_invocation_results=per_invocation_results,
)
+20 -9
@@ -18,6 +18,7 @@ from abc import abstractmethod
from typing import Optional
from google.genai import types as genai_types
from pydantic import ValidationError
from typing_extensions import override
from ..models.base_llm import BaseLlm
@@ -26,6 +27,7 @@ from ..models.llm_response import LlmResponse
from ..models.registry import LLMRegistry
from ..utils.context_utils import Aclosing
from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalMetric
from .evaluator import EvaluationResult
from .evaluator import Evaluator
@@ -49,17 +51,26 @@ class LlmAsJudge(Evaluator):
"""
def __init__(
self,
eval_metric: EvalMetric,
self, eval_metric: EvalMetric, criterion_type: type[BaseCriterion]
):
self._eval_metric = eval_metric
if not eval_metric.judge_model_options:
raise ValueError("Judge model options is required for LlmAsJudge.")
self._judge_model_options = eval_metric.judge_model_options
if self._judge_model_options.judge_model_config is None:
self._judge_model_options.judge_model_config = (
genai_types.GenerateContentConfig()
expected_criterion_type_error = ValueError(
f"`{eval_metric.metric_name}` metric expects a criterion of type"
f" `{criterion_type}`."
)
try:
if self._eval_metric.criterion is None:
raise expected_criterion_type_error
self._criterion = criterion_type.model_validate(
self._eval_metric.criterion.model_dump()
)
except ValidationError as e:
raise expected_criterion_type_error from e
self._judge_model_options = self._criterion.judge_model_options
self._judge_model = self._setup_auto_rater()
@abstractmethod
@@ -122,7 +133,7 @@ class LlmAsJudge(Evaluator):
expected_invocation=expected,
score=score,
eval_status=get_eval_status(
score, self._eval_metric.threshold
score, self._criterion.threshold
),
)
)
@@ -20,7 +20,6 @@ from ..errors.not_found_error import NotFoundError
from ..utils.feature_decorator import experimental
from .eval_metrics import EvalMetric
from .eval_metrics import MetricInfo
from .eval_metrics import MetricName
from .eval_metrics import PrebuiltMetrics
from .evaluator import Evaluator
from .final_response_match_v2 import FinalResponseMatchV2Evaluator
+96
@@ -0,0 +1,96 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from unittest import mock
from google.adk.cli.cli_eval import _DEFAULT_EVAL_CONFIG
from google.adk.cli.cli_eval import get_eval_metrics_from_config
from google.adk.cli.cli_eval import get_evaluation_criteria_or_default
from google.adk.evaluation.eval_config import EvalConfig
from google.adk.evaluation.eval_rubrics import Rubric
from google.adk.evaluation.eval_rubrics import RubricContent
def test_get_evaluation_criteria_or_default_returns_default():
assert get_evaluation_criteria_or_default("") == _DEFAULT_EVAL_CONFIG
def test_get_evaluation_criteria_or_default_reads_from_file():
eval_config = EvalConfig(
criteria={"tool_trajectory_avg_score": 0.5, "response_match_score": 0.5}
)
mock_open = mock.mock_open(read_data=eval_config.model_dump_json())
with mock.patch("builtins.open", mock_open):
assert get_evaluation_criteria_or_default("dummy_path") == eval_config
def test_get_eval_metrics_from_config():
rubric_1 = Rubric(
rubric_id="test-rubric",
rubric_content=RubricContent(text_property="test"),
)
eval_config = EvalConfig(
criteria={
"tool_trajectory_avg_score": 1.0,
"response_match_score": 0.8,
"final_response_match_v2": {
"threshold": 0.5,
"judge_model_options": {
"judge_model": "gemini-pro",
"num_samples": 1,
},
},
"rubric_based_final_response_quality_v1": {
"threshold": 0.9,
"judge_model_options": {
"judge_model": "gemini-ultra",
"num_samples": 1,
},
"rubrics": [rubric_1],
},
}
)
eval_metrics = get_eval_metrics_from_config(eval_config)
assert len(eval_metrics) == 4
assert eval_metrics[0].metric_name == "tool_trajectory_avg_score"
assert eval_metrics[0].threshold == 1.0
assert eval_metrics[0].criterion.threshold == 1.0
assert eval_metrics[1].metric_name == "response_match_score"
assert eval_metrics[1].threshold == 0.8
assert eval_metrics[1].criterion.threshold == 0.8
assert eval_metrics[2].metric_name == "final_response_match_v2"
assert eval_metrics[2].threshold == 0.5
assert eval_metrics[2].criterion.threshold == 0.5
assert (
eval_metrics[2].criterion.judge_model_options["judge_model"]
== "gemini-pro"
)
assert eval_metrics[3].metric_name == "rubric_based_final_response_quality_v1"
assert eval_metrics[3].threshold == 0.9
assert eval_metrics[3].criterion.threshold == 0.9
assert (
eval_metrics[3].criterion.judge_model_options["judge_model"]
== "gemini-ultra"
)
assert len(eval_metrics[3].criterion.rubrics) == 1
assert eval_metrics[3].criterion.rubrics[0] == rubric_1
def test_get_eval_metrics_from_config_empty_criteria():
eval_config = EvalConfig(criteria={})
eval_metrics = get_eval_metrics_from_config(eval_config)
assert not eval_metrics
+1
@@ -840,6 +840,7 @@ def test_run_eval(test_app, create_test_eval_set):
"threshold": 0.5,
"score": 1.0,
"evalStatus": 1,
"details": {},
}],
}
for k, v in expected_eval_case_result.items():
@@ -15,6 +15,7 @@
from __future__ import annotations
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import BaseCriterion
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
@@ -130,9 +131,8 @@ def _create_test_evaluator_gemini(
EvalMetric(
metric_name="final_response_match_v2",
threshold=threshold,
judge_model_options=JudgeModelOptions(
judge_model="gemini-2.5-flash",
num_samples=3,
criterion=BaseCriterion(
threshold=0.5,
),
),
)
@@ -20,6 +20,7 @@ from unittest.mock import MagicMock
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import PerInvocationResult
@@ -60,15 +61,19 @@ class MockLlmAsJudge(LlmAsJudge):
@pytest.fixture
def mock_llm_as_judge():
return MockLlmAsJudge(
EvalMetric(
eval_metric=EvalMetric(
metric_name="test_metric",
threshold=0.5,
judge_model_options=JudgeModelOptions(
judge_model="gemini-2.5-flash",
judge_model_config=genai_types.GenerateContentConfig(),
num_samples=3,
criterion=LlmAsAJudgeCriterion(
threshold=0.5,
judge_model_options=JudgeModelOptions(
judge_model="gemini-2.5-flash",
judge_model_config=genai_types.GenerateContentConfig(),
num_samples=3,
),
),
),
criterion_type=LlmAsAJudgeCriterion,
)
@@ -94,10 +99,11 @@ def test_get_eval_status():
assert get_eval_status(score=None, threshold=0.8) == EvalStatus.NOT_EVALUATED
def test_llm_as_judge_init_missing_judge_model_options():
def test_llm_as_judge_init_missing_criterion():
with pytest.raises(ValueError):
MockLlmAsJudge(
EvalMetric(metric_name="test_metric", threshold=0.8),
criterion_type=LlmAsAJudgeCriterion,
)
@@ -107,10 +113,16 @@ def test_llm_as_judge_init_unregistered_model():
EvalMetric(
metric_name="test_metric",
threshold=0.8,
judge_model_options=JudgeModelOptions(
judge_model="unregistered_model",
criterion=LlmAsAJudgeCriterion(
threshold=0.5,
judge_model_options=JudgeModelOptions(
judge_model="unregistered_model",
judge_model_config=genai_types.GenerateContentConfig(),
num_samples=3,
),
),
),
criterion_type=LlmAsAJudgeCriterion,
)