mirror of https://github.com/encounter/adk-python.git
feat: Data model for Rubric based metric and eval config
Details:
- We plan to introduce Rubric-based metrics in subsequent changes. This change introduces the data model that allows agent developers to provide rubrics.
- We also introduce a data model for the config that the eval system has been using for quite some time. That config was loosely and informally described as a dictionary of metric names and expected thresholds. In this change we formalize it with a pydantic data model and extend it to allow developers to specify rubrics as part of their eval config.

What is a rubric-based metric?

A rubric-based metric assesses an Agent's response (final or intermediate) against a rubric. This evaluation of the agent's response differs significantly from the strategy where one has to provide a golden response.

PiperOrigin-RevId: 805488436
committed by Copybara-Service
parent 37228beddd
commit e88e667770
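
For orientation, here is a sketch of the kind of test_config.json this change enables. It mirrors the sample embedded in the EvalConfig.criteria docstring and the new unit tests further down; the rubric id and rubric text are made-up placeholders, and the judge model value is only illustrative:

{
  "criteria": {
    "tool_trajectory_avg_score": 1.0,
    "response_match_score": 0.8,
    "rubric_based_final_response_quality_v1": {
      "threshold": 0.9,
      "judge_model_options": {
        "judge_model": "gemini-2.5-flash",
        "num_samples": 5
      },
      "rubrics": [
        {
          "rubric_id": "clarity-1",
          "rubric_content": {
            "text_property": "The agent's response is grammatically correct."
          }
        }
      ]
    }
  }
}

In the CLI flow below, get_evaluation_criteria_or_default() parses such a file into an EvalConfig, and get_eval_metrics_from_config() maps each criterion to an EvalMetric.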
@@ -37,6 +37,8 @@ from ..evaluation.base_eval_service import InferenceRequest
from ..evaluation.base_eval_service import InferenceResult
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from ..evaluation.eval_case import EvalCase
from ..evaluation.eval_config import BaseCriterion
from ..evaluation.eval_config import EvalConfig
from ..evaluation.eval_metrics import EvalMetric
from ..evaluation.eval_metrics import EvalMetricResult
from ..evaluation.eval_metrics import EvalMetricResultPerInvocation

@@ -64,6 +66,10 @@ DEFAULT_CRITERIA = {
    RESPONSE_MATCH_SCORE_KEY: 0.8,
}

_DEFAULT_EVAL_CONFIG = EvalConfig(
    criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
)


def _import_from_path(module_name, file_path):
  spec = importlib.util.spec_from_file_location(module_name, file_path)

@@ -81,27 +87,48 @@ def _get_agent_module(agent_module_file_path: str):

def get_evaluation_criteria_or_default(
    eval_config_file_path: str,
) -> dict[str, float]:
  """Returns evaluation criteria from the config file, if present.
) -> EvalConfig:
  """Returns EvalConfig read from the config file, if present.

  Otherwise a default one is returned.
  """
  if eval_config_file_path:
    with open(eval_config_file_path, "r", encoding="utf-8") as f:
      config_data = json.load(f)
      content = f.read()
      return EvalConfig.model_validate_json(content)

    if "criteria" in config_data and isinstance(config_data["criteria"], dict):
      evaluation_criteria = config_data["criteria"]
    else:
      raise ValueError(
          f"Invalid format for test_config.json at {eval_config_file_path}."
          " Expected a 'criteria' dictionary."
      )
  else:
    logger.info("No config file supplied. Using default criteria.")
    evaluation_criteria = DEFAULT_CRITERIA
    logger.info("No config file supplied. Using default criteria.")
    return _DEFAULT_EVAL_CONFIG

  return evaluation_criteria


def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
  """Returns a list of EvalMetrics mapped from the EvalConfig."""
  eval_metric_list = []
  if eval_config.criteria:
    for metric_name, criterion in eval_config.criteria.items():
      if isinstance(criterion, float):
        eval_metric_list.append(
            EvalMetric(
                metric_name=metric_name,
                threshold=criterion,
                criterion=BaseCriterion(threshold=criterion),
            )
        )
      elif isinstance(criterion, BaseCriterion):
        eval_metric_list.append(
            EvalMetric(
                metric_name=metric_name,
                threshold=criterion.threshold,
                criterion=criterion,
            )
        )
      else:
        raise ValueError(
            f"Unexpected criterion type. {type(criterion).__name__} not"
            " supported."
        )

  return eval_metric_list


def get_root_agent(agent_module_file_path: str) -> Agent:
@@ -382,24 +382,16 @@
    from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
    from .cli_eval import _collect_eval_results
    from .cli_eval import _collect_inferences
    from .cli_eval import get_eval_metrics_from_config
    from .cli_eval import get_evaluation_criteria_or_default
    from .cli_eval import get_root_agent
    from .cli_eval import parse_and_get_evals_to_run
  except ModuleNotFoundError as mnf:
    raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf

  evaluation_criteria = get_evaluation_criteria_or_default(config_file_path)
  eval_metrics = []
  for metric_name, threshold in evaluation_criteria.items():
    eval_metrics.append(
        EvalMetric(
            metric_name=metric_name,
            threshold=threshold,
            judge_model_options=JudgeModelOptions(),
        )
    )

  print(f"Using evaluation criteria: {evaluation_criteria}")
  eval_config = get_evaluation_criteria_or_default(config_file_path)
  print(f"Using evaluation criteria: {eval_config}")
  eval_metrics = get_eval_metrics_from_config(eval_config)

  root_agent = get_root_agent(agent_module_file_path)
  app_name = os.path.basename(agent_module_file_path)
@@ -500,7 +492,9 @@ def cli_eval(
  except ModuleNotFoundError as mnf:
    raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf

  print("*********************************************************************")
  click.echo(
      "*********************************************************************"
  )
  eval_run_summary = {}

  for eval_result in eval_results:
@@ -513,9 +507,9 @@ def cli_eval(
      eval_run_summary[eval_result.eval_set_id][0] += 1
    else:
      eval_run_summary[eval_result.eval_set_id][1] += 1
  print("Eval Run Summary")
  click.echo("Eval Run Summary")
  for eval_set_id, pass_fail_count in eval_run_summary.items():
    print(
    click.echo(
        f"{eval_set_id}:\n Tests passed: {pass_fail_count[0]}\n Tests"
        f" failed: {pass_fail_count[1]}"
    )
@@ -523,10 +517,17 @@ def cli_eval(
  if print_detailed_results:
    for eval_result in eval_results:
      eval_result: EvalCaseResult
      print(
      click.echo(
          "*********************************************************************"
      )
      print(eval_result.model_dump_json(indent=2))
      click.echo(
          eval_result.model_dump_json(
              indent=2,
              exclude_unset=True,
              exclude_defaults=True,
              exclude_none=True,
          )
      )


def adk_services_options():
@@ -1010,7 +1011,8 @@ def cli_deploy_cloud_run(

    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent

    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent -- --no-allow-unauthenticated --min-instances=2
    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
    -- --no-allow-unauthenticated --min-instances=2
  """
  if verbosity:
    click.secho(
@@ -1222,7 +1224,8 @@ def cli_deploy_agent_engine(
  Example:

    adk deploy agent_engine --project=[project] --region=[region]
    --staging_bucket=[staging_bucket] --display_name=[app_name] path/to/my_agent
    --staging_bucket=[staging_bucket] --display_name=[app_name]
    path/to/my_agent
  """
  try:
    cli_deploy.to_agent_engine(
@@ -1367,7 +1370,8 @@ def cli_deploy_gke(

  Example:

    adk deploy gke --project=[project] --region=[region] --cluster_name=[cluster_name] path/to/my_agent
    adk deploy gke --project=[project] --region=[region]
    --cluster_name=[cluster_name] path/to/my_agent
  """
  try:
    cli_deploy.to_gke(
@@ -0,0 +1,26 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import pydantic
from pydantic import alias_generators


class EvalBaseModel(pydantic.BaseModel):
  model_config = pydantic.ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
      extra='forbid',
  )
@@ -0,0 +1,66 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Union

from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field

from .eval_metrics import BaseCriterion
from .eval_metrics import Threshold


class EvalConfig(BaseModel):
  """Configurations needed to run an Eval.

  Allows users to specify metrics, their thresholds and other properties.
  """

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  criteria: dict[str, Union[Threshold, BaseCriterion]] = Field(
      default_factory=dict,
      description="""A dictionary that maps a metric to the criterion to be used for it.

The key of the dictionary is the name of the eval metric and the value is the
criterion to be used.

In the sample below, `tool_trajectory_avg_score`, `response_match_score` and
`final_response_match_v2` are the standard eval metric names, represented as
keys in the dictionary. The values in the dictionary are the corresponding
criteria. For the first two metrics, we use a simple threshold as the criterion;
the third one uses `LlmAsAJudgeCriterion`.

{
  "criteria": {
    "tool_trajectory_avg_score": 1.0,
    "response_match_score": 0.5,
    "final_response_match_v2": {
      "threshold": 0.5,
      "judge_model_options": {
        "judge_model": "my favorite LLM",
        "num_samples": 5
      }
    }
  }
}
""",
  )
@@ -25,8 +25,16 @@ from pydantic import ConfigDict
from pydantic import Field
from typing_extensions import TypeAlias

from .common import EvalBaseModel
from .eval_case import Invocation
from .evaluator import EvalStatus
from .eval_rubrics import Rubric
from .eval_rubrics import RubricScore


class EvalStatus(Enum):
  PASSED = 1
  FAILED = 2
  NOT_EVALUATED = 3


class PrebuiltMetrics(Enum):
@@ -42,9 +50,10 @@ class PrebuiltMetrics(Enum):


MetricName: TypeAlias = Union[str, PrebuiltMetrics]
Threshold: TypeAlias = float


class JudgeModelOptions(BaseModel):
class JudgeModelOptions(EvalBaseModel):
  """Options for an eval metric's judge model."""

  judge_model: str = Field(
@@ -55,27 +64,69 @@ class JudgeModelOptions(BaseModel):
  )

  judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
      default=None,
      default=genai_types.GenerateContentConfig,
      description="The configuration for the judge model.",
  )

  num_samples: Optional[int] = Field(
      default=None,
  num_samples: int = Field(
      default=5,
      description=(
          "The number of times to sample the model for each invocation"
          " evaluation."
          " evaluation. Given that models tend to have a certain degree of"
          " unreliability to them, we repeatedly sample them with the same"
          " data. These repeated invocations are then aggregated using some"
          " strategy. From experimentation, we have found 5 to be a good"
          " default."
      ),
  )


class EvalMetric(BaseModel):
  """A metric used to evaluate a particular aspect of an eval case."""
class BaseCriterion(BaseModel):
  """Base criterion to use for an Eval Metric."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
      extra="allow",
  )

  threshold: Threshold = Field(
      description="The threshold to be used by the metric.",
  )


class LlmAsAJudgeCriterion(BaseCriterion):
  """Criterion when using LLM-As-A-Judge metric."""

  judge_model_options: JudgeModelOptions = Field(
      default_factory=JudgeModelOptions,
      description="Options for the judge model.",
  )


class RubricsBasedCriterion(BaseCriterion):
  """Criterion when using a rubric based metric."""

  judge_model_options: JudgeModelOptions = Field(
      default_factory=JudgeModelOptions,
      description="Options for the judge model.",
  )

  rubrics: list[Rubric] = Field(
      default_factory=list,
      description=(
          "Rubrics to be used by Metric. Not all metrics rely on rubrics, but"
          " metrics like `rubric_based_final_response_quality_v1` do. Metrics"
          " that don't use rubrics will just ignore this field, if specified."
          " Metrics that do use rubrics will raise an exception, if they are"
          " not specified."
      ),
  )


class EvalMetric(EvalBaseModel):
  """A metric used to evaluate a particular aspect of an eval case."""

  metric_name: str = Field(
      description="The name of the metric.",
  )
@@ -88,19 +139,33 @@ class EvalMetric(BaseModel):
  )

  judge_model_options: Optional[JudgeModelOptions] = Field(
      deprecated=True,
      default=None,
      description="Options for the judge model.",
      description=(
          "[DEPRECATED] This field is deprecated in favor of `criterion`."
          " Depending on the metric you may want to use one of the sub-classes"
          " of BaseCriterion."
      ),
  )

  criterion: Optional[BaseCriterion] = Field(
      default=None, description="""Evaluation criterion used by the metric."""
  )


class EvalMetricResultDetails(EvalBaseModel):
  rubric_scores: Optional[list[RubricScore]] = Field(
      default=None,
      description=(
          "The scores obtained after applying the rubrics to the Agent's"
          " response."
      ),
  )


class EvalMetricResult(EvalMetric):
  """The actual computed score/value of a particular EvalMetric."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  score: Optional[float] = Field(
      default=None,
      description=(
@@ -108,17 +173,17 @@ class EvalMetricResult(EvalMetric):
          " might not have happened."
      ),
  )

  eval_status: EvalStatus = Field(description="The status of this evaluation.")


class EvalMetricResultPerInvocation(BaseModel):
  """Eval metric results per invocation."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  details: EvalMetricResultDetails = Field(
      default_factory=EvalMetricResultDetails, description=""""""
  )


class EvalMetricResultPerInvocation(EvalBaseModel):
  """Eval metric results per invocation."""

  actual_invocation: Invocation = Field(
      description=(
          "The actual invocation, usually obtained by inferencing the agent."
@@ -137,7 +202,7 @@ class EvalMetricResultPerInvocation(BaseModel):
  )


class Interval(BaseModel):
class Interval(EvalBaseModel):
  """Represents a range of numeric values, e.g. [0, 1] or (2, 3) or [-1, 6)."""

  min_value: float = Field(description="The smaller end of the interval.")
@@ -161,7 +226,7 @@ class Interval(BaseModel):
  )


class MetricValueInfo(BaseModel):
class MetricValueInfo(EvalBaseModel):
  """Information about the type of metric value."""

  interval: Optional[Interval] = Field(
@@ -170,14 +235,9 @@ class MetricValueInfo(BaseModel):
  )


class MetricInfo(BaseModel):
class MetricInfo(EvalBaseModel):
  """Information about the metrics that are used for Evals."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  metric_name: str = Field(description="The name of the metric.")

  description: str = Field(
@@ -0,0 +1,82 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Optional

from pydantic import Field

from .common import EvalBaseModel


class RubricContent(EvalBaseModel):
  """The content of a rubric."""

  text_property: Optional[str] = Field(
      description=(
          "The property being evaluated. Example: \"The agent's response is"
          ' grammatically correct." '
      )
  )


class Rubric(EvalBaseModel):
  """This class represents a single Rubric."""

  rubric_id: str = Field(
      description="Unique identifier for the rubric.",
  )

  rubric_content: RubricContent = Field(
      description="The actual testable criterion for the rubric."
  )

  description: Optional[str] = Field(
      default=None,
      description=(
          "A description of the rubric that provides details on how the results"
          " of the rubric assessment should be interpreted."
      ),
  )

  type: Optional[str] = Field(
      default=None,
      description="""Optional. A type designator for the rubric, which can
      inform how it's evaluated or interpreted by systems or users.

      It's recommended to use consistent, well-defined, upper snake_case
      strings.

      Examples: "TOOL_USE_QUALITY", "FINAL_RESPONSE_QUALITY",
      "INSTRUCTION_ADHERENCE".""",
  )


class RubricScore(EvalBaseModel):
  """The score obtained after applying the rubric to the Agent's response."""

  rubric_id: str = Field(description="The id of the rubric that was assessed.")

  rationale: Optional[str] = Field(
      default=None, description="Reasoning/rationale for the score."
  )

  score: Optional[float] = Field(
      default=None,
      description=(
          "Score obtained after assessing the rubric. Optional, as assessment"
          " might not have happened."
      ),
  )
@@ -11,20 +11,21 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from abc import ABC
from enum import Enum
from typing import ClassVar
from typing import Optional

from pydantic import BaseModel
from typing_extensions import TypeAlias

from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalStatus


class EvalStatus(Enum):
  PASSED = 1
  FAILED = 2
  NOT_EVALUATED = 3
# Redefining the type here for backward compatibility.
EvalStatus: TypeAlias = EvalStatus


class PerInvocationResult(BaseModel):
@@ -49,6 +50,8 @@ class EvaluationResult(BaseModel):
class Evaluator(ABC):
  """A metrics evaluator interface."""

  criterion_type: ClassVar[type[BaseCriterion]] = BaseCriterion

  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
@@ -16,6 +16,7 @@ from __future__ import annotations

import logging
import re
from typing import ClassVar
from typing import Optional

from typing_extensions import override
@@ -24,11 +25,12 @@ from ..models.llm_response import LlmResponse
from ..utils.feature_decorator import experimental
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import EvalStatus
from .eval_metrics import Interval
from .eval_metrics import LlmAsAJudgeCriterion
from .eval_metrics import MetricInfo
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import PerInvocationResult
from .llm_as_judge import LlmAsJudge
@@ -79,8 +81,6 @@ The answer should be a json alone which follows the json structure below:
Answer with assertiveness:
"""

_DEFAULT_NUM_SAMPLES = 5


def _parse_critique(response: str) -> Label:
  """Parses the judge model critique and extracts the final label.
@@ -140,15 +140,14 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
  score indicate better final response performance of the agent.
  """

  criterion_type: ClassVar[type[LlmAsAJudgeCriterion]] = LlmAsAJudgeCriterion

  def __init__(
      self,
      eval_metric: EvalMetric,
  ):
    super().__init__(eval_metric)
    super().__init__(eval_metric, FinalResponseMatchV2Evaluator.criterion_type)
    self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT
    assert self._eval_metric.judge_model_options is not None
    if self._eval_metric.judge_model_options.num_samples is None:
      self._eval_metric.judge_model_options.num_samples = _DEFAULT_NUM_SAMPLES

  @staticmethod
  def get_metric_info() -> MetricInfo:
@@ -241,7 +240,7 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
    return EvaluationResult(
        overall_score=overall_score,
        overall_eval_status=get_eval_status(
            overall_score, self._eval_metric.threshold
            overall_score, self._criterion.threshold
        ),
        per_invocation_results=per_invocation_results,
    )
@@ -18,6 +18,7 @@ from abc import abstractmethod
from typing import Optional

from google.genai import types as genai_types
from pydantic import ValidationError
from typing_extensions import override

from ..models.base_llm import BaseLlm
@@ -26,6 +27,7 @@ from ..models.llm_response import LlmResponse
from ..models.registry import LLMRegistry
from ..utils.context_utils import Aclosing
from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalMetric
from .evaluator import EvaluationResult
from .evaluator import Evaluator
@@ -49,17 +51,26 @@ class LlmAsJudge(Evaluator):
  """

  def __init__(
      self,
      eval_metric: EvalMetric,
      self, eval_metric: EvalMetric, criterion_type: type[BaseCriterion]
  ):
    self._eval_metric = eval_metric
    if not eval_metric.judge_model_options:
      raise ValueError("Judge model options is required for LlmAsJudge.")
    self._judge_model_options = eval_metric.judge_model_options
    if self._judge_model_options.judge_model_config is None:
      self._judge_model_options.judge_model_config = (
          genai_types.GenerateContentConfig()

    expected_criterion_type_error = ValueError(
        f"`{eval_metric.metric_name}` metric expects a criterion of type"
        f" `{criterion_type}`."
    )

    try:
      if self._eval_metric.criterion is None:
        raise expected_criterion_type_error

      self._criterion = criterion_type.model_validate(
          self._eval_metric.criterion.model_dump()
      )
    except ValidationError as e:
      raise expected_criterion_type_error from e

    self._judge_model_options = self._criterion.judge_model_options
    self._judge_model = self._setup_auto_rater()

  @abstractmethod
@@ -122,7 +133,7 @@ class LlmAsJudge(Evaluator):
              expected_invocation=expected,
              score=score,
              eval_status=get_eval_status(
                  score, self._eval_metric.threshold
                  score, self._criterion.threshold
              ),
          )
      )
@@ -20,7 +20,6 @@ from ..errors.not_found_error import NotFoundError
from ..utils.feature_decorator import experimental
from .eval_metrics import EvalMetric
from .eval_metrics import MetricInfo
from .eval_metrics import MetricName
from .eval_metrics import PrebuiltMetrics
from .evaluator import Evaluator
from .final_response_match_v2 import FinalResponseMatchV2Evaluator
@@ -0,0 +1,96 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from unittest import mock

from google.adk.cli.cli_eval import _DEFAULT_EVAL_CONFIG
from google.adk.cli.cli_eval import get_eval_metrics_from_config
from google.adk.cli.cli_eval import get_evaluation_criteria_or_default
from google.adk.evaluation.eval_config import EvalConfig
from google.adk.evaluation.eval_rubrics import Rubric
from google.adk.evaluation.eval_rubrics import RubricContent


def test_get_evaluation_criteria_or_default_returns_default():
  assert get_evaluation_criteria_or_default("") == _DEFAULT_EVAL_CONFIG


def test_get_evaluation_criteria_or_default_reads_from_file():
  eval_config = EvalConfig(
      criteria={"tool_trajectory_avg_score": 0.5, "response_match_score": 0.5}
  )
  mock_open = mock.mock_open(read_data=eval_config.model_dump_json())
  with mock.patch("builtins.open", mock_open):
    assert get_evaluation_criteria_or_default("dummy_path") == eval_config


def test_get_eval_metrics_from_config():
  rubric_1 = Rubric(
      rubric_id="test-rubric",
      rubric_content=RubricContent(text_property="test"),
  )
  eval_config = EvalConfig(
      criteria={
          "tool_trajectory_avg_score": 1.0,
          "response_match_score": 0.8,
          "final_response_match_v2": {
              "threshold": 0.5,
              "judge_model_options": {
                  "judge_model": "gemini-pro",
                  "num_samples": 1,
              },
          },
          "rubric_based_final_response_quality_v1": {
              "threshold": 0.9,
              "judge_model_options": {
                  "judge_model": "gemini-ultra",
                  "num_samples": 1,
              },
              "rubrics": [rubric_1],
          },
      }
  )
  eval_metrics = get_eval_metrics_from_config(eval_config)

  assert len(eval_metrics) == 4
  assert eval_metrics[0].metric_name == "tool_trajectory_avg_score"
  assert eval_metrics[0].threshold == 1.0
  assert eval_metrics[0].criterion.threshold == 1.0
  assert eval_metrics[1].metric_name == "response_match_score"
  assert eval_metrics[1].threshold == 0.8
  assert eval_metrics[1].criterion.threshold == 0.8
  assert eval_metrics[2].metric_name == "final_response_match_v2"
  assert eval_metrics[2].threshold == 0.5
  assert eval_metrics[2].criterion.threshold == 0.5
  assert (
      eval_metrics[2].criterion.judge_model_options["judge_model"]
      == "gemini-pro"
  )
  assert eval_metrics[3].metric_name == "rubric_based_final_response_quality_v1"
  assert eval_metrics[3].threshold == 0.9
  assert eval_metrics[3].criterion.threshold == 0.9
  assert (
      eval_metrics[3].criterion.judge_model_options["judge_model"]
      == "gemini-ultra"
  )
  assert len(eval_metrics[3].criterion.rubrics) == 1
  assert eval_metrics[3].criterion.rubrics[0] == rubric_1


def test_get_eval_metrics_from_config_empty_criteria():
  eval_config = EvalConfig(criteria={})
  eval_metrics = get_eval_metrics_from_config(eval_config)
  assert not eval_metrics
@@ -840,6 +840,7 @@ def test_run_eval(test_app, create_test_eval_set):
          "threshold": 0.5,
          "score": 1.0,
          "evalStatus": 1,
          "details": {},
      }],
  }
  for k, v in expected_eval_case_result.items():
@@ -15,6 +15,7 @@
from __future__ import annotations

from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import BaseCriterion
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
@@ -130,9 +131,8 @@ def _create_test_evaluator_gemini(
      EvalMetric(
          metric_name="final_response_match_v2",
          threshold=threshold,
          judge_model_options=JudgeModelOptions(
              judge_model="gemini-2.5-flash",
              num_samples=3,
          criterion=BaseCriterion(
              threshold=0.5,
          ),
      ),
  )
@@ -20,6 +20,7 @@ from unittest.mock import MagicMock
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import PerInvocationResult
@@ -60,15 +61,19 @@ class MockLlmAsJudge(LlmAsJudge):
@pytest.fixture
def mock_llm_as_judge():
  return MockLlmAsJudge(
      EvalMetric(
      eval_metric=EvalMetric(
          metric_name="test_metric",
          threshold=0.5,
          judge_model_options=JudgeModelOptions(
              judge_model="gemini-2.5-flash",
              judge_model_config=genai_types.GenerateContentConfig(),
              num_samples=3,
          criterion=LlmAsAJudgeCriterion(
              threshold=0.5,
              judge_model_options=JudgeModelOptions(
                  judge_model="gemini-2.5-flash",
                  judge_model_config=genai_types.GenerateContentConfig(),
                  num_samples=3,
              ),
          ),
      ),
      criterion_type=LlmAsAJudgeCriterion,
  )


@@ -94,10 +99,11 @@ def test_get_eval_status():
  assert get_eval_status(score=None, threshold=0.8) == EvalStatus.NOT_EVALUATED


def test_llm_as_judge_init_missing_judge_model_options():
def test_llm_as_judge_init_missing_criterion():
  with pytest.raises(ValueError):
    MockLlmAsJudge(
        EvalMetric(metric_name="test_metric", threshold=0.8),
        criterion_type=LlmAsAJudgeCriterion,
    )


@@ -107,10 +113,16 @@ def test_llm_as_judge_init_unregistered_model():
        EvalMetric(
            metric_name="test_metric",
            threshold=0.8,
            judge_model_options=JudgeModelOptions(
                judge_model="unregistered_model",
            criterion=LlmAsAJudgeCriterion(
                threshold=0.5,
                judge_model_options=JudgeModelOptions(
                    judge_model="unregistered_model",
                    judge_model_config=genai_types.GenerateContentConfig(),
                    num_samples=3,
                ),
            ),
        ),
        criterion_type=LlmAsAJudgeCriterion,
    )