mirror of https://github.com/encounter/adk-python.git
feat: Data model for Rubric based metric and eval config
Details:
- We plan to introduce Rubric-based metrics in subsequent changes. This change introduces the data model that allows agent developers to provide rubrics.
- We also introduce a data model for the config that the eval system has been using for quite some time. That config was loosely and informally described as a dictionary of metric names and expected thresholds. In this change we formalize it with a pydantic data model and extend it to allow developers to specify rubrics as part of their eval config.

What is a rubric-based metric?

A rubric-based metric assesses an Agent's response (final or intermediate) against a rubric. This evaluation of the agent's response differs significantly from the strategy where one has to provide a golden response.

PiperOrigin-RevId: 805488436
committed by Copybara-Service
parent 37228beddd
commit e88e667770
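
For orientation, here is a sketch of the kind of test_config.json this change enables. It mirrors the sample embedded in the EvalConfig.criteria docstring and the new unit tests further down; the rubric id and rubric text are made-up placeholders, and the judge model value is only illustrative:

{
  "criteria": {
    "tool_trajectory_avg_score": 1.0,
    "response_match_score": 0.8,
    "rubric_based_final_response_quality_v1": {
      "threshold": 0.9,
      "judge_model_options": {
        "judge_model": "gemini-2.5-flash",
        "num_samples": 5
      },
      "rubrics": [
        {
          "rubric_id": "clarity-1",
          "rubric_content": {
            "text_property": "The agent's response is grammatically correct."
          }
        }
      ]
    }
  }
}

In the CLI flow below, get_evaluation_criteria_or_default() parses such a file into an EvalConfig, and get_eval_metrics_from_config() maps each criterion to an EvalMetric.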
@@ -37,6 +37,8 @@ from ..evaluation.base_eval_service import InferenceRequest
from ..evaluation.base_eval_service import InferenceResult
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
from ..evaluation.eval_case import EvalCase
from ..evaluation.eval_config import BaseCriterion
from ..evaluation.eval_config import EvalConfig
from ..evaluation.eval_metrics import EvalMetric
from ..evaluation.eval_metrics import EvalMetricResult
from ..evaluation.eval_metrics import EvalMetricResultPerInvocation

@@ -64,6 +66,10 @@ DEFAULT_CRITERIA = {
    RESPONSE_MATCH_SCORE_KEY: 0.8,
}

_DEFAULT_EVAL_CONFIG = EvalConfig(
    criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
)


def _import_from_path(module_name, file_path):
  spec = importlib.util.spec_from_file_location(module_name, file_path)

@@ -81,27 +87,48 @@ def _get_agent_module(agent_module_file_path: str):

def get_evaluation_criteria_or_default(
    eval_config_file_path: str,
) -> dict[str, float]:
  """Returns evaluation criteria from the config file, if present.
) -> EvalConfig:
  """Returns EvalConfig read from the config file, if present.

  Otherwise a default one is returned.
  """
  if eval_config_file_path:
    with open(eval_config_file_path, "r", encoding="utf-8") as f:
      config_data = json.load(f)
      content = f.read()
      return EvalConfig.model_validate_json(content)

    if "criteria" in config_data and isinstance(config_data["criteria"], dict):
      evaluation_criteria = config_data["criteria"]
    else:
      raise ValueError(
          f"Invalid format for test_config.json at {eval_config_file_path}."
          " Expected a 'criteria' dictionary."
      )
  else:
    logger.info("No config file supplied. Using default criteria.")
    evaluation_criteria = DEFAULT_CRITERIA
    logger.info("No config file supplied. Using default criteria.")
    return _DEFAULT_EVAL_CONFIG

  return evaluation_criteria


def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
  """Returns a list of EvalMetrics mapped from the EvalConfig."""
  eval_metric_list = []
  if eval_config.criteria:
    for metric_name, criterion in eval_config.criteria.items():
      if isinstance(criterion, float):
        eval_metric_list.append(
            EvalMetric(
                metric_name=metric_name,
                threshold=criterion,
                criterion=BaseCriterion(threshold=criterion),
            )
        )
      elif isinstance(criterion, BaseCriterion):
        eval_metric_list.append(
            EvalMetric(
                metric_name=metric_name,
                threshold=criterion.threshold,
                criterion=criterion,
            )
        )
      else:
        raise ValueError(
            f"Unexpected criterion type. {type(criterion).__name__} not"
            " supported."
        )

  return eval_metric_list


def get_root_agent(agent_module_file_path: str) -> Agent:
@@ -382,24 +382,16 @@
    from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
    from .cli_eval import _collect_eval_results
    from .cli_eval import _collect_inferences
    from .cli_eval import get_eval_metrics_from_config
    from .cli_eval import get_evaluation_criteria_or_default
    from .cli_eval import get_root_agent
    from .cli_eval import parse_and_get_evals_to_run
  except ModuleNotFoundError as mnf:
    raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf

  evaluation_criteria = get_evaluation_criteria_or_default(config_file_path)
  eval_metrics = []
  for metric_name, threshold in evaluation_criteria.items():
    eval_metrics.append(
        EvalMetric(
            metric_name=metric_name,
            threshold=threshold,
            judge_model_options=JudgeModelOptions(),
        )
    )

  print(f"Using evaluation criteria: {evaluation_criteria}")
  eval_config = get_evaluation_criteria_or_default(config_file_path)
  print(f"Using evaluation criteria: {eval_config}")
  eval_metrics = get_eval_metrics_from_config(eval_config)

  root_agent = get_root_agent(agent_module_file_path)
  app_name = os.path.basename(agent_module_file_path)
@@ -500,7 +492,9 @@ def cli_eval(
  except ModuleNotFoundError as mnf:
    raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf

  print("*********************************************************************")
  click.echo(
      "*********************************************************************"
  )
  eval_run_summary = {}

  for eval_result in eval_results:
@@ -513,9 +507,9 @@ def cli_eval(
      eval_run_summary[eval_result.eval_set_id][0] += 1
    else:
      eval_run_summary[eval_result.eval_set_id][1] += 1
  print("Eval Run Summary")
  click.echo("Eval Run Summary")
  for eval_set_id, pass_fail_count in eval_run_summary.items():
    print(
    click.echo(
        f"{eval_set_id}:\n Tests passed: {pass_fail_count[0]}\n Tests"
        f" failed: {pass_fail_count[1]}"
    )
@@ -523,10 +517,17 @@ def cli_eval(
  if print_detailed_results:
    for eval_result in eval_results:
      eval_result: EvalCaseResult
      print(
      click.echo(
          "*********************************************************************"
      )
      print(eval_result.model_dump_json(indent=2))
      click.echo(
          eval_result.model_dump_json(
              indent=2,
              exclude_unset=True,
              exclude_defaults=True,
              exclude_none=True,
          )
      )


def adk_services_options():
@@ -1010,7 +1011,8 @@ def cli_deploy_cloud_run(

    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent

    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent -- --no-allow-unauthenticated --min-instances=2
    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
    -- --no-allow-unauthenticated --min-instances=2
  """
  if verbosity:
    click.secho(
@@ -1222,7 +1224,8 @@ def cli_deploy_agent_engine(
  Example:

    adk deploy agent_engine --project=[project] --region=[region]
    --staging_bucket=[staging_bucket] --display_name=[app_name] path/to/my_agent
    --staging_bucket=[staging_bucket] --display_name=[app_name]
    path/to/my_agent
  """
  try:
    cli_deploy.to_agent_engine(
@@ -1367,7 +1370,8 @@ def cli_deploy_gke(

  Example:

    adk deploy gke --project=[project] --region=[region] --cluster_name=[cluster_name] path/to/my_agent
    adk deploy gke --project=[project] --region=[region]
    --cluster_name=[cluster_name] path/to/my_agent
  """
  try:
    cli_deploy.to_gke(
@@ -0,0 +1,26 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import pydantic
from pydantic import alias_generators


class EvalBaseModel(pydantic.BaseModel):
  model_config = pydantic.ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
      extra='forbid',
  )
@@ -0,0 +1,66 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Union

from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field

from .eval_metrics import BaseCriterion
from .eval_metrics import Threshold


class EvalConfig(BaseModel):
  """Configurations needed to run an Eval.

  Allows users to specify metrics, their thresholds and other properties.
  """

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  criteria: dict[str, Union[Threshold, BaseCriterion]] = Field(
      default_factory=dict,
      description="""A dictionary that maps a metric to the criterion to be used for it.

The key of the dictionary is the name of the eval metric and the value is the
criterion to be used.

In the sample below, `tool_trajectory_avg_score`, `response_match_score` and
`final_response_match_v2` are the standard eval metric names, represented as
keys in the dictionary. The values in the dictionary are the corresponding
criteria. For the first two metrics, we use a simple threshold as the criterion;
the third one uses `LlmAsAJudgeCriterion`.

{
  "criteria": {
    "tool_trajectory_avg_score": 1.0,
    "response_match_score": 0.5,
    "final_response_match_v2": {
      "threshold": 0.5,
      "judge_model_options": {
        "judge_model": "my favorite LLM",
        "num_samples": 5
      }
    }
  }
}
""",
  )
@@ -25,8 +25,16 @@ from pydantic import ConfigDict
from pydantic import Field
from typing_extensions import TypeAlias

from .common import EvalBaseModel
from .eval_case import Invocation
from .evaluator import EvalStatus
from .eval_rubrics import Rubric
from .eval_rubrics import RubricScore


class EvalStatus(Enum):
  PASSED = 1
  FAILED = 2
  NOT_EVALUATED = 3


class PrebuiltMetrics(Enum):
@@ -42,9 +50,10 @@ class PrebuiltMetrics(Enum):


MetricName: TypeAlias = Union[str, PrebuiltMetrics]
Threshold: TypeAlias = float


class JudgeModelOptions(BaseModel):
class JudgeModelOptions(EvalBaseModel):
  """Options for an eval metric's judge model."""

  judge_model: str = Field(
@@ -55,27 +64,69 @@ class JudgeModelOptions(BaseModel):
  )

  judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
      default=None,
      default=genai_types.GenerateContentConfig,
      description="The configuration for the judge model.",
  )

  num_samples: Optional[int] = Field(
      default=None,
  num_samples: int = Field(
      default=5,
      description=(
          "The number of times to sample the model for each invocation"
          " evaluation."
          " evaluation. Given that models tend to have a certain degree of"
          " unreliability to them, we repeatedly sample them with the same"
          " data. These repeated invocations are then aggregated using some"
          " strategy. From experimentation, we have found 5 to be a good"
          " default."
      ),
  )


class EvalMetric(BaseModel):
  """A metric used to evaluate a particular aspect of an eval case."""
class BaseCriterion(BaseModel):
  """Base criterion to use for an Eval Metric."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
      extra="allow",
  )

  threshold: Threshold = Field(
      description="The threshold to be used by the metric.",
  )


class LlmAsAJudgeCriterion(BaseCriterion):
  """Criterion when using LLM-As-A-Judge metric."""

  judge_model_options: JudgeModelOptions = Field(
      default_factory=JudgeModelOptions,
      description="Options for the judge model.",
  )


class RubricsBasedCriterion(BaseCriterion):
  """Criterion when using a rubric based metric."""

  judge_model_options: JudgeModelOptions = Field(
      default_factory=JudgeModelOptions,
      description="Options for the judge model.",
  )

  rubrics: list[Rubric] = Field(
      default_factory=list,
      description=(
          "Rubrics to be used by Metric. Not all metrics rely on rubrics, but"
          " metrics like `rubric_based_final_response_quality_v1` do. Metrics"
          " that don't use rubrics will just ignore this field, if specified."
          " Metrics that do use rubrics will raise an exception, if they are"
          " not specified."
      ),
  )


class EvalMetric(EvalBaseModel):
  """A metric used to evaluate a particular aspect of an eval case."""

  metric_name: str = Field(
      description="The name of the metric.",
  )
@@ -88,19 +139,33 @@ class EvalMetric(BaseModel):
  )

  judge_model_options: Optional[JudgeModelOptions] = Field(
      deprecated=True,
      default=None,
      description="Options for the judge model.",
      description=(
          "[DEPRECATED] This field is deprecated in favor of `criterion`."
          " Depending on the metric you may want to use one of the sub-classes"
          " of BaseCriterion."
      ),
  )

  criterion: Optional[BaseCriterion] = Field(
      default=None, description="""Evaluation criterion used by the metric."""
  )


class EvalMetricResultDetails(EvalBaseModel):
  rubric_scores: Optional[list[RubricScore]] = Field(
      default=None,
      description=(
          "The scores obtained after applying the rubrics to the Agent's"
          " response."
      ),
  )


class EvalMetricResult(EvalMetric):
  """The actual computed score/value of a particular EvalMetric."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  score: Optional[float] = Field(
      default=None,
      description=(
@@ -108,17 +173,17 @@ class EvalMetricResult(EvalMetric):
          " might not have happened."
      ),
  )

  eval_status: EvalStatus = Field(description="The status of this evaluation.")


class EvalMetricResultPerInvocation(BaseModel):
  """Eval metric results per invocation."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  details: EvalMetricResultDetails = Field(
      default_factory=EvalMetricResultDetails, description=""""""
  )


class EvalMetricResultPerInvocation(EvalBaseModel):
  """Eval metric results per invocation."""

  actual_invocation: Invocation = Field(
      description=(
          "The actual invocation, usually obtained by inferencing the agent."
@@ -137,7 +202,7 @@ class EvalMetricResultPerInvocation(BaseModel):
  )


class Interval(BaseModel):
class Interval(EvalBaseModel):
  """Represents a range of numeric values, e.g. [0, 1] or (2, 3) or [-1, 6)."""

  min_value: float = Field(description="The smaller end of the interval.")
@@ -161,7 +226,7 @@ class Interval(BaseModel):
  )


class MetricValueInfo(BaseModel):
class MetricValueInfo(EvalBaseModel):
  """Information about the type of metric value."""

  interval: Optional[Interval] = Field(
@@ -170,14 +235,9 @@ class MetricValueInfo(BaseModel):
  )


class MetricInfo(BaseModel):
class MetricInfo(EvalBaseModel):
  """Information about the metrics that are used for Evals."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  metric_name: str = Field(description="The name of the metric.")

  description: str = Field(
@@ -0,0 +1,82 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Optional

from pydantic import Field

from .common import EvalBaseModel


class RubricContent(EvalBaseModel):
  """The content of a rubric."""

  text_property: Optional[str] = Field(
      description=(
          "The property being evaluated. Example: \"The agent's response is"
          ' grammatically correct." '
      )
  )


class Rubric(EvalBaseModel):
  """This class represents a single Rubric."""

  rubric_id: str = Field(
      description="Unique identifier for the rubric.",
  )

  rubric_content: RubricContent = Field(
      description="The actual testable criterion for the rubric."
  )

  description: Optional[str] = Field(
      default=None,
      description=(
          "A description of the rubric that provides details on how the results"
          " of the rubric assessment should be interpreted."
      ),
  )

  type: Optional[str] = Field(
      default=None,
      description="""Optional. A type designator for the rubric, which can
      inform how it's evaluated or interpreted by systems or users.

      It's recommended to use consistent, well-defined, upper snake_case
      strings.

      Examples: "TOOL_USE_QUALITY", "FINAL_RESPONSE_QUALITY",
      "INSTRUCTION_ADHERENCE".""",
  )


class RubricScore(EvalBaseModel):
  """The score obtained after applying the rubric to the Agent's response."""

  rubric_id: str = Field(description="The id of the rubric that was assessed.")

  rationale: Optional[str] = Field(
      default=None, description="Reasoning/rationale for the score."
  )

  score: Optional[float] = Field(
      default=None,
      description=(
          "Score obtained after assessing the rubric. Optional, as assessment"
          " might not have happened."
      ),
  )
@@ -11,20 +11,21 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from abc import ABC
from enum import Enum
from typing import ClassVar
from typing import Optional

from pydantic import BaseModel
from typing_extensions import TypeAlias

from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalStatus


class EvalStatus(Enum):
  PASSED = 1
  FAILED = 2
  NOT_EVALUATED = 3
# Redefining the type here for backward compatibility.
EvalStatus: TypeAlias = EvalStatus


class PerInvocationResult(BaseModel):
@@ -49,6 +50,8 @@ class EvaluationResult(BaseModel):
class Evaluator(ABC):
  """A metrics evaluator interface."""

  criterion_type: ClassVar[type[BaseCriterion]] = BaseCriterion

  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
@@ -16,6 +16,7 @@ from __future__ import annotations

import logging
import re
from typing import ClassVar
from typing import Optional

from typing_extensions import override
@@ -24,11 +25,12 @@ from ..models.llm_response import LlmResponse
from ..utils.feature_decorator import experimental
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .eval_metrics import EvalStatus
from .eval_metrics import Interval
from .eval_metrics import LlmAsAJudgeCriterion
from .eval_metrics import MetricInfo
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import PerInvocationResult
from .llm_as_judge import LlmAsJudge
@@ -79,8 +81,6 @@ The answer should be a json alone which follows the json structure below:
Answer with assertiveness:
"""

_DEFAULT_NUM_SAMPLES = 5


def _parse_critique(response: str) -> Label:
  """Parses the judge model critique and extracts the final label.
@@ -140,15 +140,14 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
  score indicate better final response performance of the agent.
  """

  criterion_type: ClassVar[type[LlmAsAJudgeCriterion]] = LlmAsAJudgeCriterion

  def __init__(
      self,
      eval_metric: EvalMetric,
  ):
    super().__init__(eval_metric)
    super().__init__(eval_metric, FinalResponseMatchV2Evaluator.criterion_type)
    self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT
    assert self._eval_metric.judge_model_options is not None
    if self._eval_metric.judge_model_options.num_samples is None:
      self._eval_metric.judge_model_options.num_samples = _DEFAULT_NUM_SAMPLES

  @staticmethod
  def get_metric_info() -> MetricInfo:
@@ -241,7 +240,7 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
    return EvaluationResult(
        overall_score=overall_score,
        overall_eval_status=get_eval_status(
            overall_score, self._eval_metric.threshold
            overall_score, self._criterion.threshold
        ),
        per_invocation_results=per_invocation_results,
    )
@@ -18,6 +18,7 @@ from abc import abstractmethod
from typing import Optional

from google.genai import types as genai_types
from pydantic import ValidationError
from typing_extensions import override

from ..models.base_llm import BaseLlm
@@ -26,6 +27,7 @@ from ..models.llm_response import LlmResponse
from ..models.registry import LLMRegistry
from ..utils.context_utils import Aclosing
from .eval_case import Invocation
from .eval_metrics import BaseCriterion
from .eval_metrics import EvalMetric
from .evaluator import EvaluationResult
from .evaluator import Evaluator
@@ -49,17 +51,26 @@ class LlmAsJudge(Evaluator):
  """

  def __init__(
      self,
      eval_metric: EvalMetric,
      self, eval_metric: EvalMetric, criterion_type: type[BaseCriterion]
  ):
    self._eval_metric = eval_metric
    if not eval_metric.judge_model_options:
      raise ValueError("Judge model options is required for LlmAsJudge.")
    self._judge_model_options = eval_metric.judge_model_options
    if self._judge_model_options.judge_model_config is None:
      self._judge_model_options.judge_model_config = (
          genai_types.GenerateContentConfig()

    expected_criterion_type_error = ValueError(
        f"`{eval_metric.metric_name}` metric expects a criterion of type"
        f" `{criterion_type}`."
    )

    try:
      if self._eval_metric.criterion is None:
        raise expected_criterion_type_error

      self._criterion = criterion_type.model_validate(
          self._eval_metric.criterion.model_dump()
      )
    except ValidationError as e:
      raise expected_criterion_type_error from e

    self._judge_model_options = self._criterion.judge_model_options
    self._judge_model = self._setup_auto_rater()

  @abstractmethod
@@ -122,7 +133,7 @@ class LlmAsJudge(Evaluator):
              expected_invocation=expected,
              score=score,
              eval_status=get_eval_status(
                  score, self._eval_metric.threshold
                  score, self._criterion.threshold
              ),
          )
      )
@@ -20,7 +20,6 @@ from ..errors.not_found_error import NotFoundError
from ..utils.feature_decorator import experimental
from .eval_metrics import EvalMetric
from .eval_metrics import MetricInfo
from .eval_metrics import MetricName
from .eval_metrics import PrebuiltMetrics
from .evaluator import Evaluator
from .final_response_match_v2 import FinalResponseMatchV2Evaluator
@@ -0,0 +1,96 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from unittest import mock

from google.adk.cli.cli_eval import _DEFAULT_EVAL_CONFIG
from google.adk.cli.cli_eval import get_eval_metrics_from_config
from google.adk.cli.cli_eval import get_evaluation_criteria_or_default
from google.adk.evaluation.eval_config import EvalConfig
from google.adk.evaluation.eval_rubrics import Rubric
from google.adk.evaluation.eval_rubrics import RubricContent


def test_get_evaluation_criteria_or_default_returns_default():
  assert get_evaluation_criteria_or_default("") == _DEFAULT_EVAL_CONFIG


def test_get_evaluation_criteria_or_default_reads_from_file():
  eval_config = EvalConfig(
      criteria={"tool_trajectory_avg_score": 0.5, "response_match_score": 0.5}
  )
  mock_open = mock.mock_open(read_data=eval_config.model_dump_json())
  with mock.patch("builtins.open", mock_open):
    assert get_evaluation_criteria_or_default("dummy_path") == eval_config


def test_get_eval_metrics_from_config():
  rubric_1 = Rubric(
      rubric_id="test-rubric",
      rubric_content=RubricContent(text_property="test"),
  )
  eval_config = EvalConfig(
      criteria={
          "tool_trajectory_avg_score": 1.0,
          "response_match_score": 0.8,
          "final_response_match_v2": {
              "threshold": 0.5,
              "judge_model_options": {
                  "judge_model": "gemini-pro",
                  "num_samples": 1,
              },
          },
          "rubric_based_final_response_quality_v1": {
              "threshold": 0.9,
              "judge_model_options": {
                  "judge_model": "gemini-ultra",
                  "num_samples": 1,
              },
              "rubrics": [rubric_1],
          },
      }
  )
  eval_metrics = get_eval_metrics_from_config(eval_config)

  assert len(eval_metrics) == 4
  assert eval_metrics[0].metric_name == "tool_trajectory_avg_score"
  assert eval_metrics[0].threshold == 1.0
  assert eval_metrics[0].criterion.threshold == 1.0
  assert eval_metrics[1].metric_name == "response_match_score"
  assert eval_metrics[1].threshold == 0.8
  assert eval_metrics[1].criterion.threshold == 0.8
  assert eval_metrics[2].metric_name == "final_response_match_v2"
  assert eval_metrics[2].threshold == 0.5
  assert eval_metrics[2].criterion.threshold == 0.5
  assert (
      eval_metrics[2].criterion.judge_model_options["judge_model"]
      == "gemini-pro"
  )
  assert eval_metrics[3].metric_name == "rubric_based_final_response_quality_v1"
  assert eval_metrics[3].threshold == 0.9
  assert eval_metrics[3].criterion.threshold == 0.9
  assert (
      eval_metrics[3].criterion.judge_model_options["judge_model"]
      == "gemini-ultra"
  )
  assert len(eval_metrics[3].criterion.rubrics) == 1
  assert eval_metrics[3].criterion.rubrics[0] == rubric_1


def test_get_eval_metrics_from_config_empty_criteria():
  eval_config = EvalConfig(criteria={})
  eval_metrics = get_eval_metrics_from_config(eval_config)
  assert not eval_metrics
@@ -840,6 +840,7 @@ def test_run_eval(test_app, create_test_eval_set):
          "threshold": 0.5,
          "score": 1.0,
          "evalStatus": 1,
          "details": {},
      }],
  }
  for k, v in expected_eval_case_result.items():
@@ -15,6 +15,7 @@
from __future__ import annotations

from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import BaseCriterion
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
@@ -130,9 +131,8 @@ def _create_test_evaluator_gemini(
      EvalMetric(
          metric_name="final_response_match_v2",
          threshold=threshold,
          judge_model_options=JudgeModelOptions(
              judge_model="gemini-2.5-flash",
              num_samples=3,
          criterion=BaseCriterion(
              threshold=0.5,
          ),
      ),
  )
@@ -20,6 +20,7 @@ from unittest.mock import MagicMock
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import PerInvocationResult
@@ -60,15 +61,19 @@ class MockLlmAsJudge(LlmAsJudge):
@pytest.fixture
def mock_llm_as_judge():
  return MockLlmAsJudge(
      EvalMetric(
      eval_metric=EvalMetric(
          metric_name="test_metric",
          threshold=0.5,
          judge_model_options=JudgeModelOptions(
              judge_model="gemini-2.5-flash",
              judge_model_config=genai_types.GenerateContentConfig(),
              num_samples=3,
          criterion=LlmAsAJudgeCriterion(
              threshold=0.5,
              judge_model_options=JudgeModelOptions(
                  judge_model="gemini-2.5-flash",
                  judge_model_config=genai_types.GenerateContentConfig(),
                  num_samples=3,
              ),
          ),
      ),
      criterion_type=LlmAsAJudgeCriterion,
  )


@@ -94,10 +99,11 @@ def test_get_eval_status():
  assert get_eval_status(score=None, threshold=0.8) == EvalStatus.NOT_EVALUATED


def test_llm_as_judge_init_missing_judge_model_options():
def test_llm_as_judge_init_missing_criterion():
  with pytest.raises(ValueError):
    MockLlmAsJudge(
        EvalMetric(metric_name="test_metric", threshold=0.8),
        criterion_type=LlmAsAJudgeCriterion,
    )


@@ -107,10 +113,16 @@ def test_llm_as_judge_init_unregistered_model():
        EvalMetric(
            metric_name="test_metric",
            threshold=0.8,
            judge_model_options=JudgeModelOptions(
                judge_model="unregistered_model",
            criterion=LlmAsAJudgeCriterion(
                threshold=0.5,
                judge_model_options=JudgeModelOptions(
                    judge_model="unregistered_model",
                    judge_model_config=genai_types.GenerateContentConfig(),
                    num_samples=3,
                ),
            ),
        ),
        criterion_type=LlmAsAJudgeCriterion,
    )