You've already forked adk-python
mirror of
https://github.com/encounter/adk-python.git
synced 2026-03-30 10:57:20 -07:00
feat: Update EvalConfig and EvalMetric data models to support custom metrics
Co-authored-by: Joseph Pagadora <jcpagadora@google.com> PiperOrigin-RevId: 855517478
This commit is contained in:
committed by
Copybara-Service
parent
905604faac
commit
6d2f33a59c
@@ -23,7 +23,9 @@ from pydantic import alias_generators
|
||||
from pydantic import BaseModel
|
||||
from pydantic import ConfigDict
|
||||
from pydantic import Field
|
||||
from pydantic import model_validator
|
||||
|
||||
from ..agents.common_configs import CodeConfig
|
||||
from ..evaluation.eval_metrics import EvalMetric
|
||||
from .eval_metrics import BaseCriterion
|
||||
from .eval_metrics import Threshold
|
||||
@@ -72,11 +74,46 @@ the third one uses `LlmAsAJudgeCriterion`.
|
||||
""",
|
||||
)
|
||||
|
||||
custom_metrics: Optional[dict[str, CodeConfig]] = Field(
|
||||
default=None,
|
||||
description="""A dictionary mapping custom metric names to CodeConfig
|
||||
objects, which specify the path to the function for each custom metric.
|
||||
|
||||
If a metric name in `criteria` is also present in `custom_metrics`, the
|
||||
corresponding `CodeConfig`'s `name` field will be used to locate the custom
|
||||
metric implementation. The `name` field should contain the fully qualified
|
||||
path to the custom metric function, e.g., `my.custom.metrics.metric_function`.
|
||||
|
||||
Example:
|
||||
{
|
||||
"criteria": {
|
||||
"my_custom_metric": 0.5
|
||||
},
|
||||
"custom_metrics": {
|
||||
"my_custom_metric": {
|
||||
"name": "path.to.my.custom.metric.function"
|
||||
}
|
||||
}
|
||||
}
|
||||
""",
|
||||
)
|
||||
|
||||
user_simulator_config: Optional[BaseUserSimulatorConfig] = Field(
|
||||
default=None,
|
||||
description="Config to be used by the user simulator.",
|
||||
)
|
||||
|
||||
@model_validator(mode="after")
def check_custom_metrics_code_config_args(self) -> "EvalConfig":
  """Rejects custom metric entries whose `CodeConfig` sets `args`.

  Returns:
    This instance, unchanged, when no entry in `custom_metrics` has a truthy
    `args` value (or when `custom_metrics` is unset or empty).

  Raises:
    ValueError: For the first custom metric whose `CodeConfig.args` is truthy.
  """
  # An unset or empty mapping has nothing to check, so iterate over an empty
  # dict in that case.
  for metric_name, code_config in (self.custom_metrics or {}).items():
    if not code_config.args:
      continue
    raise ValueError(
        f"args field in CodeConfig for custom metric '{metric_name}' is"
        " not supported."
    )
  return self
|
||||
|
||||
|
||||
_DEFAULT_EVAL_CONFIG = EvalConfig(
|
||||
criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
|
||||
@@ -106,12 +143,20 @@ def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
|
||||
eval_metric_list = []
|
||||
if eval_config.criteria:
|
||||
for metric_name, criterion in eval_config.criteria.items():
|
||||
custom_function_path = None
|
||||
if (
|
||||
eval_config.custom_metrics
|
||||
and metric_name in eval_config.custom_metrics
|
||||
):
|
||||
custom_function_path = eval_config.custom_metrics[metric_name].name
|
||||
|
||||
if isinstance(criterion, float):
|
||||
eval_metric_list.append(
|
||||
EvalMetric(
|
||||
metric_name=metric_name,
|
||||
threshold=criterion,
|
||||
criterion=BaseCriterion(threshold=criterion),
|
||||
custom_function_path=custom_function_path,
|
||||
)
|
||||
)
|
||||
elif isinstance(criterion, BaseCriterion):
|
||||
@@ -120,6 +165,7 @@ def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
|
||||
metric_name=metric_name,
|
||||
threshold=criterion.threshold,
|
||||
criterion=criterion,
|
||||
custom_function_path=custom_function_path,
|
||||
)
|
||||
)
|
||||
else:
|
||||
|
||||
@@ -279,6 +279,11 @@ class EvalMetric(EvalBaseModel):
|
||||
default=None, description="""Evaluation criterion used by the metric."""
|
||||
)
|
||||
|
||||
custom_function_path: Optional[str] = Field(
|
||||
default=None,
|
||||
description="""Path to custom function, if this is a custom metric.""",
|
||||
)
|
||||
|
||||
|
||||
class EvalMetricResultDetails(EvalBaseModel):
|
||||
rubric_scores: Optional[list[RubricScore]] = Field(
|
||||
|
||||
@@ -20,6 +20,7 @@ from google.adk.evaluation.eval_config import get_eval_metrics_from_config
|
||||
from google.adk.evaluation.eval_config import get_evaluation_criteria_or_default
|
||||
from google.adk.evaluation.eval_rubrics import Rubric
|
||||
from google.adk.evaluation.eval_rubrics import RubricContent
|
||||
import pytest
|
||||
|
||||
|
||||
def test_get_evaluation_criteria_or_default_returns_default():
|
||||
@@ -99,6 +100,42 @@ def test_get_eval_metrics_from_config():
|
||||
assert eval_metrics[3].criterion.rubrics[0] == rubric_1
|
||||
|
||||
|
||||
def test_get_eval_metrics_from_config_with_custom_metrics():
  """Custom metric function paths are carried onto the resulting EvalMetrics.

  One criterion is given as a bare float and the other as a mapping with a
  `threshold`, so both criterion forms are exercised.
  """
  criteria = {
      "custom_metric_1": 1.0,
      "custom_metric_2": {
          "threshold": 0.5,
      },
  }
  custom_metrics = {
      "custom_metric_1": {"name": "path/to/custom/metric_1"},
      "custom_metric_2": {"name": "path/to/custom/metric_2"},
  }
  eval_config = EvalConfig(criteria=criteria, custom_metrics=custom_metrics)

  eval_metrics = get_eval_metrics_from_config(eval_config)

  assert len(eval_metrics) == 2

  # Metrics are expected in the same order as the `criteria` entries.
  first_metric = eval_metrics[0]
  assert first_metric.metric_name == "custom_metric_1"
  assert first_metric.threshold == 1.0
  assert first_metric.criterion.threshold == 1.0
  assert first_metric.custom_function_path == "path/to/custom/metric_1"

  second_metric = eval_metrics[1]
  assert second_metric.metric_name == "custom_metric_2"
  assert second_metric.threshold == 0.5
  assert second_metric.criterion.threshold == 0.5
  assert second_metric.custom_function_path == "path/to/custom/metric_2"
|
||||
|
||||
|
||||
def test_custom_metric_code_config_with_args_raises_error():
  """Building an EvalConfig fails when a custom metric's CodeConfig has args."""
  # The error is raised while the model is constructed, so the result is never
  # bound to a name. `match` ties the failure to the args check's own message
  # so that an unrelated validation error does not satisfy this test.
  with pytest.raises(ValueError, match="args field in CodeConfig"):
    EvalConfig(
        criteria={"custom_metric": 1.0},
        custom_metrics={
            "custom_metric": {"name": "name", "args": [{"value": 1}]}
        },
    )
|
||||
|
||||
|
||||
def test_get_eval_metrics_from_config_empty_criteria():
|
||||
eval_config = EvalConfig(criteria={})
|
||||
eval_metrics = get_eval_metrics_from_config(eval_config)
|
||||
|
||||
Reference in New Issue
Block a user