diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py
index 7ccc8252..6914125d 100644
--- a/src/google/adk/cli/cli_eval.py
+++ b/src/google/adk/cli/cli_eval.py
@@ -37,6 +37,8 @@ from ..evaluation.base_eval_service import InferenceRequest
 from ..evaluation.base_eval_service import InferenceResult
 from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
 from ..evaluation.eval_case import EvalCase
+from ..evaluation.eval_config import BaseCriterion
+from ..evaluation.eval_config import EvalConfig
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
@@ -64,6 +66,10 @@ DEFAULT_CRITERIA = {
     RESPONSE_MATCH_SCORE_KEY: 0.8,
 }
 
+_DEFAULT_EVAL_CONFIG = EvalConfig(
+    criteria={"tool_trajectory_avg_score": 1.0, "response_match_score": 0.8}
+)
+
 
 def _import_from_path(module_name, file_path):
   spec = importlib.util.spec_from_file_location(module_name, file_path)
@@ -81,27 +87,48 @@ def _get_agent_module(agent_module_file_path: str):
 
 def get_evaluation_criteria_or_default(
     eval_config_file_path: str,
-) -> dict[str, float]:
-  """Returns evaluation criteria from the config file, if present.
+) -> EvalConfig:
+  """Returns EvalConfig read from the config file, if present.
 
   Otherwise a default one is returned.
   """
   if eval_config_file_path:
     with open(eval_config_file_path, "r", encoding="utf-8") as f:
-      config_data = json.load(f)
+      content = f.read()
+      return EvalConfig.model_validate_json(content)
 
-    if "criteria" in config_data and isinstance(config_data["criteria"], dict):
-      evaluation_criteria = config_data["criteria"]
-    else:
-      raise ValueError(
-          f"Invalid format for test_config.json at {eval_config_file_path}."
-          " Expected a 'criteria' dictionary."
-      )
-  else:
-    logger.info("No config file supplied. Using default criteria.")
-    evaluation_criteria = DEFAULT_CRITERIA
+  logger.info("No config file supplied. Using default criteria.")
+  return _DEFAULT_EVAL_CONFIG
 
-  return evaluation_criteria
+
+def get_eval_metrics_from_config(eval_config: EvalConfig) -> list[EvalMetric]:
+  """Returns a list of EvalMetrics mapped from the EvalConfig."""
+  eval_metric_list = []
+  if eval_config.criteria:
+    for metric_name, criterion in eval_config.criteria.items():
+      if isinstance(criterion, float):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion,
+                criterion=BaseCriterion(threshold=criterion),
+            )
+        )
+      elif isinstance(criterion, BaseCriterion):
+        eval_metric_list.append(
+            EvalMetric(
+                metric_name=metric_name,
+                threshold=criterion.threshold,
+                criterion=criterion,
+            )
+        )
+      else:
+        raise ValueError(
+            f"Unexpected criterion type. {type(criterion).__name__} not"
+            " supported."
+        )
+
+  return eval_metric_list
 
 
 def get_root_agent(agent_module_file_path: str) -> Agent:
diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index c45fdd37..019b3284 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -382,24 +382,16 @@ def cli_eval(
     from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
     from .cli_eval import _collect_eval_results
     from .cli_eval import _collect_inferences
+    from .cli_eval import get_eval_metrics_from_config
     from .cli_eval import get_evaluation_criteria_or_default
     from .cli_eval import get_root_agent
     from .cli_eval import parse_and_get_evals_to_run
   except ModuleNotFoundError as mnf:
     raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
 
-  evaluation_criteria = get_evaluation_criteria_or_default(config_file_path)
-  eval_metrics = []
-  for metric_name, threshold in evaluation_criteria.items():
-    eval_metrics.append(
-        EvalMetric(
-            metric_name=metric_name,
-            threshold=threshold,
-            judge_model_options=JudgeModelOptions(),
-        )
-    )
-
-  print(f"Using evaluation criteria: {evaluation_criteria}")
+  eval_config = get_evaluation_criteria_or_default(config_file_path)
+  print(f"Using evaluation criteria: {eval_config}")
+  eval_metrics = get_eval_metrics_from_config(eval_config)
 
   root_agent = get_root_agent(agent_module_file_path)
   app_name = os.path.basename(agent_module_file_path)
@@ -500,7 +492,9 @@ def cli_eval(
   except ModuleNotFoundError as mnf:
     raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
 
-  print("*********************************************************************")
+  click.echo(
+      "*********************************************************************"
+  )
 
   eval_run_summary = {}
   for eval_result in eval_results:
@@ -513,9 +507,9 @@
       eval_run_summary[eval_result.eval_set_id][0] += 1
     else:
       eval_run_summary[eval_result.eval_set_id][1] += 1
-  print("Eval Run Summary")
+  click.echo("Eval Run Summary")
   for eval_set_id, pass_fail_count in eval_run_summary.items():
-    print(
+    click.echo(
         f"{eval_set_id}:\n  Tests passed: {pass_fail_count[0]}\n  Tests"
         f" failed: {pass_fail_count[1]}"
     )
@@ -523,10 +517,17 @@
   if print_detailed_results:
     for eval_result in eval_results:
       eval_result: EvalCaseResult
-      print(
+      click.echo(
           "*********************************************************************"
       )
-      print(eval_result.model_dump_json(indent=2))
+      click.echo(
+          eval_result.model_dump_json(
+              indent=2,
+              exclude_unset=True,
+              exclude_defaults=True,
+              exclude_none=True,
+          )
+      )
 
 
 def adk_services_options():
@@ -1010,7 +1011,8 @@ def cli_deploy_cloud_run(
 
     adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
 
-    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent -- --no-allow-unauthenticated --min-instances=2
+    adk deploy cloud_run --project=[project] --region=[region] path/to/my_agent
+    -- --no-allow-unauthenticated --min-instances=2
   """
   if verbosity:
     click.secho(
@@ -1222,7 +1224,8 @@ def cli_deploy_agent_engine(
   Example:
 
     adk deploy agent_engine --project=[project] --region=[region]
-    --staging_bucket=[staging_bucket] --display_name=[app_name] path/to/my_agent
+    --staging_bucket=[staging_bucket] --display_name=[app_name]
+    path/to/my_agent
   """
   try:
     cli_deploy.to_agent_engine(
@@ -1367,7 +1370,8 @@ def cli_deploy_gke(
 
   Example:
 
-    adk deploy gke --project=[project] --region=[region] --cluster_name=[cluster_name] path/to/my_agent
+    adk deploy gke --project=[project] --region=[region]
+    --cluster_name=[cluster_name] path/to/my_agent
   """
   try:
     cli_deploy.to_gke(
diff --git a/src/google/adk/evaluation/common.py b/src/google/adk/evaluation/common.py
new file mode 100644
index 00000000..3f349d57
--- /dev/null
+++ b/src/google/adk/evaluation/common.py
@@ -0,0 +1,26 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import pydantic
+from pydantic import alias_generators
+
+
+class EvalBaseModel(pydantic.BaseModel):
+  model_config = pydantic.ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+      extra='forbid',
+  )
diff --git a/src/google/adk/evaluation/eval_config.py b/src/google/adk/evaluation/eval_config.py
new file mode 100644
index 00000000..cc2de90e
--- /dev/null
+++ b/src/google/adk/evaluation/eval_config.py
@@ -0,0 +1,66 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Union
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from .eval_metrics import BaseCriterion
+from .eval_metrics import Threshold
+
+
+class EvalConfig(BaseModel):
+  """Configurations needed to run an Eval.
+
+  Allows users to specify metrics, their thresholds and other properties.
+  """
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  criteria: dict[str, Union[Threshold, BaseCriterion]] = Field(
+      default_factory=dict,
+      description="""A dictionary that maps each eval metric to its criterion.
+
+The key of the dictionary is the name of the eval metric and the value is the
+criterion to be used.
+
+In the sample below, `tool_trajectory_avg_score`, `response_match_score` and
+`final_response_match_v2` are the standard eval metric names, represented as
+keys in the dictionary. The values in the dictionary are the corresponding
+criteria. For the first two metrics, a simple threshold is used as the
+criterion; the third one uses `LlmAsAJudgeCriterion`.
+
+{
+  "criteria": {
+    "tool_trajectory_avg_score": 1.0,
+    "response_match_score": 0.5,
+    "final_response_match_v2": {
+      "threshold": 0.5,
+      "judge_model_options": {
+        "judge_model": "my favorite LLM",
+        "num_samples": 5
+      }
+    }
+  }
+}
+""",
+  )
diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index d73ce1e6..66f7299f 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -25,8 +25,16 @@ from pydantic import ConfigDict
 from pydantic import Field
 from typing_extensions import TypeAlias
 
+from .common import EvalBaseModel
 from .eval_case import Invocation
-from .evaluator import EvalStatus
+from .eval_rubrics import Rubric
+from .eval_rubrics import RubricScore
+
+
+class EvalStatus(Enum):
+  PASSED = 1
+  FAILED = 2
+  NOT_EVALUATED = 3
 
 
 class PrebuiltMetrics(Enum):
@@ -42,9 +50,10 @@ class PrebuiltMetrics(Enum):
 
 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
 
+Threshold: TypeAlias = float
 
-class JudgeModelOptions(BaseModel):
+class JudgeModelOptions(EvalBaseModel):
   """Options for an eval metric's judge model."""
 
   judge_model: str = Field(
@@ -55,27 +64,69 @@
   )
 
   judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
-      default=None,
+      default_factory=genai_types.GenerateContentConfig,
       description="The configuration for the judge model.",
   )
 
-  num_samples: Optional[int] = Field(
-      default=None,
+  num_samples: int = Field(
+      default=5,
       description=(
           "The number of times to sample the model for each invocation"
-          " evaluation."
+          " evaluation. Given that models tend to have a certain degree of"
+          " unreliability, we repeatedly sample them with the same data. These"
+          " repeated invocations are then aggregated using some strategy. From"
+          " experimentation, we have found 5 to be a good default."
       ),
   )
 
 
-class EvalMetric(BaseModel):
-  """A metric used to evaluate a particular aspect of an eval case."""
+class BaseCriterion(BaseModel):
+  """Base criterion to use for an Eval Metric."""
 
   model_config = ConfigDict(
       alias_generator=alias_generators.to_camel,
       populate_by_name=True,
+      extra="allow",
   )
 
+  threshold: Threshold = Field(
+      description="The threshold to be used by the metric.",
+  )
+
+
+class LlmAsAJudgeCriterion(BaseCriterion):
+  """Criterion when using an LLM-As-A-Judge metric."""
+
+  judge_model_options: JudgeModelOptions = Field(
+      default_factory=JudgeModelOptions,
+      description="Options for the judge model.",
+  )
+
+
+class RubricsBasedCriterion(BaseCriterion):
+  """Criterion when using a rubric based metric."""
+
+  judge_model_options: JudgeModelOptions = Field(
+      default_factory=JudgeModelOptions,
+      description="Options for the judge model.",
+  )
+
+  rubrics: list[Rubric] = Field(
+      default_factory=list,
+      description=(
+          "Rubrics to be used by the metric. Not all metrics rely on rubrics,"
+          " but metrics like `rubric_based_final_response_quality_v1` do."
+          " Metrics that don't use rubrics will simply ignore this field if it"
+          " is specified. Metrics that do use rubrics will raise an exception"
+          " if they are not specified."
+      ),
+  )
+
+
+class EvalMetric(EvalBaseModel):
+  """A metric used to evaluate a particular aspect of an eval case."""
+
   metric_name: str = Field(
       description="The name of the metric.",
   )
@@ -88,19 +139,33 @@
   )
 
   judge_model_options: Optional[JudgeModelOptions] = Field(
+      deprecated=True,
       default=None,
-      description="Options for the judge model.",
+      description=(
+          "[DEPRECATED] This field is deprecated in favor of `criterion`."
+          " Depending on the metric, you may want to use one of the"
+          " sub-classes of `BaseCriterion`."
+      ),
+  )
+
+  criterion: Optional[BaseCriterion] = Field(
+      default=None, description="""Evaluation criterion used by the metric."""
+  )
+
+
+class EvalMetricResultDetails(EvalBaseModel):
+  rubric_scores: Optional[list[RubricScore]] = Field(
+      default=None,
+      description=(
+          "The scores obtained after applying the rubrics to the Agent's"
+          " response."
+      ),
   )
 
 
 class EvalMetricResult(EvalMetric):
   """The actual computed score/value of a particular EvalMetric."""
 
-  model_config = ConfigDict(
-      alias_generator=alias_generators.to_camel,
-      populate_by_name=True,
-  )
-
   score: Optional[float] = Field(
       default=None,
       description=(
@@ -108,17 +173,17 @@ class EvalMetricResult(EvalMetric):
           " might not have happened."
      ),
   )
+
   eval_status: EvalStatus = Field(description="The status of this evaluation.")
 
-
-class EvalMetricResultPerInvocation(BaseModel):
-  """Eval metric results per invocation."""
-
-  model_config = ConfigDict(
-      alias_generator=alias_generators.to_camel,
-      populate_by_name=True,
+  details: EvalMetricResultDetails = Field(
+      default_factory=EvalMetricResultDetails,
+      description="""Granular details of the metric result, e.g. rubric scores.""",
   )
 
+
+class EvalMetricResultPerInvocation(EvalBaseModel):
+  """Eval metric results per invocation."""
+
   actual_invocation: Invocation = Field(
       description=(
           "The actual invocation, usually obtained by inferencing the agent."
       )
   )
@@ -137,7 +202,7 @@
   )
 
 
-class Interval(BaseModel):
+class Interval(EvalBaseModel):
   """Represents a range of numeric values, e.g. [0 ,1] or (2,3) or [-1, 6)."""
 
   min_value: float = Field(description="The smaller end of the interval.")
@@ -161,7 +226,7 @@
   )
 
 
-class MetricValueInfo(BaseModel):
+class MetricValueInfo(EvalBaseModel):
   """Information about the type of metric value."""
 
   interval: Optional[Interval] = Field(
@@ -170,14 +235,9 @@
   )
 
 
-class MetricInfo(BaseModel):
+class MetricInfo(EvalBaseModel):
   """Information about the metric that are used for Evals."""
 
-  model_config = ConfigDict(
-      alias_generator=alias_generators.to_camel,
-      populate_by_name=True,
-  )
-
   metric_name: str = Field(description="The name of the metric.")
 
   description: str = Field(
diff --git a/src/google/adk/evaluation/eval_rubrics.py b/src/google/adk/evaluation/eval_rubrics.py
new file mode 100644
index 00000000..8dd2f6ca
--- /dev/null
+++ b/src/google/adk/evaluation/eval_rubrics.py
@@ -0,0 +1,82 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import Field
+
+from .common import EvalBaseModel
+
+
+class RubricContent(EvalBaseModel):
+  """The content of a rubric."""
+
+  text_property: Optional[str] = Field(
+      description=(
+          "The property being evaluated. Example: \"The agent's response is"
+          ' grammatically correct." '
+      )
+  )
+
+
+class Rubric(EvalBaseModel):
+  """This class represents a single Rubric."""
+
+  rubric_id: str = Field(
+      description="Unique identifier for the rubric.",
+  )
+
+  rubric_content: RubricContent = Field(
+      description="The actual testable criterion for the rubric."
+  )
+
+  description: Optional[str] = Field(
+      default=None,
+      description=(
+          "A description of the rubric that provides details on how the"
+          " results of the rubric assessment should be interpreted."
+      ),
+  )
+
+  type: Optional[str] = Field(
+      default=None,
+      description="""Optional. A type designator for the rubric, which can
+      inform how it's evaluated or interpreted by systems or users.
+
+      It's recommended to use consistent, well-defined, upper snake_case
+      strings.
+
+      Examples: "TOOL_USE_QUALITY", "FINAL_RESPONSE_QUALITY",
+      "INSTRUCTION_ADHERENCE".""",
+  )
+
+
+class RubricScore(EvalBaseModel):
+  """The score obtained after applying the rubric to the Agent's response."""
+
+  rubric_id: str = Field(description="The id of the rubric that was assessed.")
+
+  rationale: Optional[str] = Field(
+      default=None, description="Reasoning/rationale for the score."
+  )
+
+  score: Optional[float] = Field(
+      default=None,
+      description=(
+          "Score obtained after assessing the rubric. Optional, as assessment"
+          " might not have happened."
+      ),
+  )
diff --git a/src/google/adk/evaluation/evaluator.py b/src/google/adk/evaluation/evaluator.py
index bc19313d..07ee9584 100644
--- a/src/google/adk/evaluation/evaluator.py
+++ b/src/google/adk/evaluation/evaluator.py
@@ -11,20 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 from abc import ABC
-from enum import Enum
+from typing import ClassVar
 from typing import Optional
 
 from pydantic import BaseModel
+from typing_extensions import TypeAlias
 
 from .eval_case import Invocation
+from .eval_metrics import BaseCriterion
+from .eval_metrics import EvalStatus
 
-
-class EvalStatus(Enum):
-  PASSED = 1
-  FAILED = 2
-  NOT_EVALUATED = 3
+# Redefining the type here for backward compatibility.
+EvalStatus: TypeAlias = EvalStatus
 
 
 class PerInvocationResult(BaseModel):
@@ -49,6 +50,8 @@ class EvaluationResult(BaseModel):
 class Evaluator(ABC):
   """A merics evaluator interface."""
 
+  criterion_type: ClassVar[type[BaseCriterion]] = BaseCriterion
+
   def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
diff --git a/src/google/adk/evaluation/final_response_match_v2.py b/src/google/adk/evaluation/final_response_match_v2.py
index 177e719a..827f397b 100644
--- a/src/google/adk/evaluation/final_response_match_v2.py
+++ b/src/google/adk/evaluation/final_response_match_v2.py
@@ -16,6 +16,7 @@ from __future__ import annotations
 
 import logging
 import re
+from typing import ClassVar
 from typing import Optional
 
 from typing_extensions import override
@@ -24,11 +25,12 @@ from ..models.llm_response import LlmResponse
 from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
+from .eval_metrics import EvalStatus
 from .eval_metrics import Interval
+from .eval_metrics import LlmAsAJudgeCriterion
 from .eval_metrics import MetricInfo
 from .eval_metrics import MetricValueInfo
 from .eval_metrics import PrebuiltMetrics
-from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import PerInvocationResult
 from .llm_as_judge import LlmAsJudge
@@ -79,8 +81,6 @@ The answer should be a json alone which follows the json structure below:
 Answer with assertiveness:
 """
 
-_DEFAULT_NUM_SAMPLES = 5
-
 
 def _parse_critique(response: str) -> Label:
   """Parses the judge model critique and extracts the final label.
@@ -140,15 +140,14 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
   score indicate better final response performance of the agent.
   """
 
+  criterion_type: ClassVar[type[LlmAsAJudgeCriterion]] = LlmAsAJudgeCriterion
+
   def __init__(
       self,
       eval_metric: EvalMetric,
   ):
-    super().__init__(eval_metric)
+    super().__init__(eval_metric, FinalResponseMatchV2Evaluator.criterion_type)
     self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT
-    assert self._eval_metric.judge_model_options is not None
-    if self._eval_metric.judge_model_options.num_samples is None:
-      self._eval_metric.judge_model_options.num_samples = _DEFAULT_NUM_SAMPLES
 
   @staticmethod
   def get_metric_info() -> MetricInfo:
@@ -241,7 +240,7 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
     return EvaluationResult(
         overall_score=overall_score,
         overall_eval_status=get_eval_status(
-            overall_score, self._eval_metric.threshold
+            overall_score, self._criterion.threshold
         ),
         per_invocation_results=per_invocation_results,
     )
diff --git a/src/google/adk/evaluation/llm_as_judge.py b/src/google/adk/evaluation/llm_as_judge.py
index b17ee82d..cf86ffbb 100644
--- a/src/google/adk/evaluation/llm_as_judge.py
+++ b/src/google/adk/evaluation/llm_as_judge.py
@@ -18,6 +18,7 @@ from abc import abstractmethod
 from typing import Optional
 
 from google.genai import types as genai_types
+from pydantic import ValidationError
 from typing_extensions import override
 
 from ..models.base_llm import BaseLlm
@@ -26,6 +27,7 @@ from ..models.llm_response import LlmResponse
 from ..models.registry import LLMRegistry
 from ..utils.context_utils import Aclosing
 from .eval_case import Invocation
+from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
@@ -49,17 +51,26 @@ class LlmAsJudge(Evaluator):
   """
 
   def __init__(
-      self,
-      eval_metric: EvalMetric,
+      self, eval_metric: EvalMetric, criterion_type: type[BaseCriterion]
   ):
     self._eval_metric = eval_metric
-    if not eval_metric.judge_model_options:
-      raise ValueError("Judge model options is required for LlmAsJudge.")
-    self._judge_model_options = eval_metric.judge_model_options
-    if self._judge_model_options.judge_model_config is None:
-      self._judge_model_options.judge_model_config = (
-          genai_types.GenerateContentConfig()
+
+    expected_criterion_type_error = ValueError(
+        f"`{eval_metric.metric_name}` metric expects a criterion of type"
+        f" `{criterion_type}`."
+    )
+
+    try:
+      if self._eval_metric.criterion is None:
+        raise expected_criterion_type_error
+
+      self._criterion = criterion_type.model_validate(
+          self._eval_metric.criterion.model_dump()
       )
+    except ValidationError as e:
+      raise expected_criterion_type_error from e
+
+    self._judge_model_options = self._criterion.judge_model_options
     self._judge_model = self._setup_auto_rater()
 
   @abstractmethod
@@ -122,7 +133,7 @@ class LlmAsJudge(Evaluator):
                   expected_invocation=expected,
                   score=score,
                   eval_status=get_eval_status(
-                      score, self._eval_metric.threshold
+                      score, self._criterion.threshold
                   ),
               )
           )
diff --git a/src/google/adk/evaluation/metric_evaluator_registry.py b/src/google/adk/evaluation/metric_evaluator_registry.py
index e5fd33f4..e2bcd5f8 100644
--- a/src/google/adk/evaluation/metric_evaluator_registry.py
+++ b/src/google/adk/evaluation/metric_evaluator_registry.py
@@ -20,7 +20,6 @@ from ..errors.not_found_error import NotFoundError
 from ..utils.feature_decorator import experimental
 from .eval_metrics import EvalMetric
 from .eval_metrics import MetricInfo
-from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
 from .final_response_match_v2 import FinalResponseMatchV2Evaluator
diff --git a/tests/unittests/cli/test_cli_eval.py b/tests/unittests/cli/test_cli_eval.py
new file mode 100644
index 00000000..2b284ded
--- /dev/null
+++ b/tests/unittests/cli/test_cli_eval.py
@@ -0,0 +1,96 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from unittest import mock
+
+from google.adk.cli.cli_eval import _DEFAULT_EVAL_CONFIG
+from google.adk.cli.cli_eval import get_eval_metrics_from_config
+from google.adk.cli.cli_eval import get_evaluation_criteria_or_default
+from google.adk.evaluation.eval_config import EvalConfig
+from google.adk.evaluation.eval_rubrics import Rubric
+from google.adk.evaluation.eval_rubrics import RubricContent
+
+
+def test_get_evaluation_criteria_or_default_returns_default():
+  assert get_evaluation_criteria_or_default("") == _DEFAULT_EVAL_CONFIG
+
+
+def test_get_evaluation_criteria_or_default_reads_from_file():
+  eval_config = EvalConfig(
+      criteria={"tool_trajectory_avg_score": 0.5, "response_match_score": 0.5}
+  )
+  mock_open = mock.mock_open(read_data=eval_config.model_dump_json())
+  with mock.patch("builtins.open", mock_open):
+    assert get_evaluation_criteria_or_default("dummy_path") == eval_config
+
+
+def test_get_eval_metrics_from_config():
+  rubric_1 = Rubric(
+      rubric_id="test-rubric",
+      rubric_content=RubricContent(text_property="test"),
+  )
+  eval_config = EvalConfig(
+      criteria={
+          "tool_trajectory_avg_score": 1.0,
+          "response_match_score": 0.8,
+          "final_response_match_v2": {
+              "threshold": 0.5,
+              "judge_model_options": {
+                  "judge_model": "gemini-pro",
+                  "num_samples": 1,
+              },
+          },
+          "rubric_based_final_response_quality_v1": {
+              "threshold": 0.9,
+              "judge_model_options": {
+                  "judge_model": "gemini-ultra",
+                  "num_samples": 1,
+              },
+              "rubrics": [rubric_1],
+          },
+      }
+  )
+  eval_metrics = get_eval_metrics_from_config(eval_config)
+
+  assert len(eval_metrics) == 4
+  assert eval_metrics[0].metric_name == "tool_trajectory_avg_score"
+  assert eval_metrics[0].threshold == 1.0
+  assert eval_metrics[0].criterion.threshold == 1.0
+  assert eval_metrics[1].metric_name == "response_match_score"
+  assert eval_metrics[1].threshold == 0.8
+  assert eval_metrics[1].criterion.threshold == 0.8
+  assert eval_metrics[2].metric_name == "final_response_match_v2"
+  assert eval_metrics[2].threshold == 0.5
+  assert eval_metrics[2].criterion.threshold == 0.5
+  assert (
+      eval_metrics[2].criterion.judge_model_options["judge_model"]
+      == "gemini-pro"
+  )
+  assert eval_metrics[3].metric_name == "rubric_based_final_response_quality_v1"
+  assert eval_metrics[3].threshold == 0.9
+  assert eval_metrics[3].criterion.threshold == 0.9
+  assert (
+      eval_metrics[3].criterion.judge_model_options["judge_model"]
+      == "gemini-ultra"
+  )
+  assert len(eval_metrics[3].criterion.rubrics) == 1
+  assert eval_metrics[3].criterion.rubrics[0] == rubric_1
+
+
+def test_get_eval_metrics_from_config_empty_criteria():
+  eval_config = EvalConfig(criteria={})
+  eval_metrics = get_eval_metrics_from_config(eval_config)
+  assert not eval_metrics
diff --git a/tests/unittests/cli/test_fast_api.py b/tests/unittests/cli/test_fast_api.py
index d0d84cac..d1e8dcab 100755
--- a/tests/unittests/cli/test_fast_api.py
+++ b/tests/unittests/cli/test_fast_api.py
@@ -840,6 +840,7 @@ def test_run_eval(test_app, create_test_eval_set):
             "threshold": 0.5,
             "score": 1.0,
             "evalStatus": 1,
+            "details": {},
         }],
     }
     for k, v in expected_eval_case_result.items():
diff --git a/tests/unittests/evaluation/test_final_response_match_v2.py b/tests/unittests/evaluation/test_final_response_match_v2.py
index 911c5e22..382b7a7d 100644
--- a/tests/unittests/evaluation/test_final_response_match_v2.py
+++ b/tests/unittests/evaluation/test_final_response_match_v2.py
@@ -15,6 +15,7 @@ from __future__ import annotations
 
 from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_metrics import BaseCriterion
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import JudgeModelOptions
 from google.adk.evaluation.eval_metrics import PrebuiltMetrics
@@ -130,9 +131,8 @@ def _create_test_evaluator_gemini(
       EvalMetric(
           metric_name="final_response_match_v2",
           threshold=threshold,
-          judge_model_options=JudgeModelOptions(
-              judge_model="gemini-2.5-flash",
-              num_samples=3,
+          criterion=BaseCriterion(
+              threshold=0.5,
           ),
       ),
   )
diff --git a/tests/unittests/evaluation/test_llm_as_judge.py b/tests/unittests/evaluation/test_llm_as_judge.py
index 2a4ba13c..d03d88b2 100644
--- a/tests/unittests/evaluation/test_llm_as_judge.py
+++ b/tests/unittests/evaluation/test_llm_as_judge.py
@@ -20,6 +20,7 @@ from unittest.mock import MagicMock
 from google.adk.evaluation.eval_case import Invocation
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import JudgeModelOptions
+from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.evaluator import EvaluationResult
 from google.adk.evaluation.evaluator import PerInvocationResult
@@ -60,15 +61,19 @@ class MockLlmAsJudge(LlmAsJudge):
 @pytest.fixture
 def mock_llm_as_judge():
   return MockLlmAsJudge(
-      EvalMetric(
+      eval_metric=EvalMetric(
          metric_name="test_metric",
          threshold=0.5,
-          judge_model_options=JudgeModelOptions(
-              judge_model="gemini-2.5-flash",
-              judge_model_config=genai_types.GenerateContentConfig(),
-              num_samples=3,
+          criterion=LlmAsAJudgeCriterion(
+              threshold=0.5,
+              judge_model_options=JudgeModelOptions(
+                  judge_model="gemini-2.5-flash",
+                  judge_model_config=genai_types.GenerateContentConfig(),
+                  num_samples=3,
+              ),
          ),
      ),
+      criterion_type=LlmAsAJudgeCriterion,
  )
 
 
@@ -94,10 +99,11 @@ def test_get_eval_status():
   assert get_eval_status(score=None, threshold=0.8) == EvalStatus.NOT_EVALUATED
 
 
-def test_llm_as_judge_init_missing_judge_model_options():
+def test_llm_as_judge_init_missing_criterion():
   with pytest.raises(ValueError):
     MockLlmAsJudge(
         EvalMetric(metric_name="test_metric", threshold=0.8),
+        criterion_type=LlmAsAJudgeCriterion,
     )
 
 
@@ -107,10 +113,16 @@ def test_llm_as_judge_init_unregistered_model():
         EvalMetric(
             metric_name="test_metric",
             threshold=0.8,
-            judge_model_options=JudgeModelOptions(
-                judge_model="unregistered_model",
+            criterion=LlmAsAJudgeCriterion(
+                threshold=0.5,
+                judge_model_options=JudgeModelOptions(
+                    judge_model="unregistered_model",
+                    judge_model_config=genai_types.GenerateContentConfig(),
+                    num_samples=3,
+                ),
            ),
        ),
+        criterion_type=LlmAsAJudgeCriterion,
    )
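
A minimal usage sketch of the new config plumbing, outside the diff itself: it strings together the two helpers added to cli_eval.py the same way the `adk eval` command now does. The "test_config.json" path and the printed fields are illustrative assumptions, not part of the change.

# Illustrative sketch only; assumes a config file shaped like the example in eval_config.py.
from google.adk.cli.cli_eval import get_eval_metrics_from_config
from google.adk.cli.cli_eval import get_evaluation_criteria_or_default

# Reads an EvalConfig from the JSON file when a path is given,
# otherwise falls back to the built-in default criteria.
eval_config = get_evaluation_criteria_or_default("test_config.json")

# Maps each `criteria` entry (a bare float threshold or a BaseCriterion-shaped
# object) to an EvalMetric with its `criterion` field populated.
eval_metrics = get_eval_metrics_from_config(eval_config)
for metric in eval_metrics:
  print(metric.metric_name, metric.criterion.threshold)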