feat: Update EvalConfig and EvalMetric data models to support custom metrics

Co-authored-by: Joseph Pagadora <jcpagadora@google.com>
PiperOrigin-RevId: 855517478
2026-03-30 10:57:20 -07:00 · 2026-01-12 21:36:44 -08:00
parent 905604faac
commit 6d2f33a59c
3 changed files with 88 additions and 0 deletions
@@ -23,7 +23,9 @@ from pydantic import alias_generators
 from pydantic import BaseModel
 from pydantic import ConfigDict
 from pydantic import Field
+from pydantic import model_validator

+from ..agents.common_configs import CodeConfig
 from ..evaluation.eval_metrics import EvalMetric
 from .eval_metrics import BaseCriterion
 from .eval_metrics import Threshold
@@ -72,11 +74,46 @@ the third one uses `LlmAsAJudgeCriterion`.
 """,
  )

+  custom_metrics: Optional[dict[str, CodeConfig]] = Field(
+      default=None,
+      description="""A dictionary mapping custom metric names to CodeConfig
+objects, which specify the path to the function for each custom metric.
+
+If a metric name in `criteria` is also present in `custom_metrics`, the
+corresponding `CodeConfig`'s `name` field will be used to locate the custom
+metric implementation. The `name` field should contain the fully qualified
+path to the custom metric function, e.g., `my.custom.metrics.metric_function`.
+
+Example:
+{
+  "criteria": {
+    "my_custom_metric": 0.5
+  },
+  "custom_metrics": {
+    "my_custom_metric": {
+      "name": "path.to.my.custom.metric.function"
+    }
+  }
+}
+""",
+  )
+
  user_simulator_config: Optional[BaseUserSimulatorConfig] = Field(
      default=None,
      description="Config to be used by the user simulator.",
  )

+  @model_validator(mode="after")
+  def check_custom_metrics_code_config_args(self) -> "EvalConfig":
+    if self.custom_metrics:
+      for metric_name, metric_config in self.custom_metrics.items():
+        if metric_config.args:
+          raise ValueError(
+              f"args field in CodeConfig for custom metric '{metric_name}' is"
+              " not supported."
+          )
+    return self
+

 _DEFAULT_EVAL_CONFIG = EvalConfig(
    criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
@@ -106,12 +143,20 @@ def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
  eval_metric_list = []
  if eval_config.criteria:
    for metric_name, criterion in eval_config.criteria.items():
+      custom_function_path = None
+      if (
+          eval_config.custom_metrics
+          and metric_name in eval_config.custom_metrics
+      ):
+        custom_function_path = eval_config.custom_metrics[metric_name].name
+
      if isinstance(criterion, float):
        eval_metric_list.append(
            EvalMetric(
                metric_name=metric_name,
                threshold=criterion,
                criterion=BaseCriterion(threshold=criterion),
+                custom_function_path=custom_function_path,
            )
        )
      elif isinstance(criterion, BaseCriterion):
@@ -120,6 +165,7 @@ def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
                metric_name=metric_name,
                threshold=criterion.threshold,
                criterion=criterion,
+                custom_function_path=custom_function_path,
            )
        )
      else:
@@ -279,6 +279,11 @@ class EvalMetric(EvalBaseModel):
      default=None, description="""Evaluation criterion used by the metric."""
  )

+  custom_function_path: Optional[str] = Field(
+      default=None,
+      description="""Path to custom function, if this is a custom metric.""",
+  )
+

 class EvalMetricResultDetails(EvalBaseModel):
  rubric_scores: Optional[list[RubricScore]] = Field(
@@ -20,6 +20,7 @@ from google.adk.evaluation.eval_config import get_eval_metrics_from_config
 from google.adk.evaluation.eval_config import get_evaluation_criteria_or_default
 from google.adk.evaluation.eval_rubrics import Rubric
 from google.adk.evaluation.eval_rubrics import RubricContent
+import pytest


 def test_get_evaluation_criteria_or_default_returns_default():
@@ -99,6 +100,42 @@ def test_get_eval_metrics_from_config():
  assert eval_metrics[3].criterion.rubrics[0] == rubric_1


+def test_get_eval_metrics_from_config_with_custom_metrics():
+  eval_config = EvalConfig(
+      criteria={
+          "custom_metric_1": 1.0,
+          "custom_metric_2": {
+              "threshold": 0.5,
+          },
+      },
+      custom_metrics={
+          "custom_metric_1": {"name": "path/to/custom/metric_1"},
+          "custom_metric_2": {"name": "path/to/custom/metric_2"},
+      },
+  )
+  eval_metrics = get_eval_metrics_from_config(eval_config)
+
+  assert len(eval_metrics) == 2
+  assert eval_metrics[0].metric_name == "custom_metric_1"
+  assert eval_metrics[0].threshold == 1.0
+  assert eval_metrics[0].criterion.threshold == 1.0
+  assert eval_metrics[0].custom_function_path == "path/to/custom/metric_1"
+  assert eval_metrics[1].metric_name == "custom_metric_2"
+  assert eval_metrics[1].threshold == 0.5
+  assert eval_metrics[1].criterion.threshold == 0.5
+  assert eval_metrics[1].custom_function_path == "path/to/custom/metric_2"
+
+
+def test_custom_metric_code_config_with_args_raises_error():
+  with pytest.raises(ValueError):
+    eval_config = EvalConfig(
+        criteria={"custom_metric": 1.0},
+        custom_metrics={
+            "custom_metric": {"name": "name", "args": [{"value": 1}]}
+        },
+    )
+
+
 def test_get_eval_metrics_from_config_empty_criteria():
  eval_config = EvalConfig(criteria={})
  eval_metrics = get_eval_metrics_from_config(eval_config)