feat: Implement auto rater-based evaluator for responses

PiperOrigin-RevId: 780654576
Joseph Pagadora
2025-07-08 11:48:49 -07:00
committed by Copybara-Service
parent 45d60a1906
commit 75699fbeca
7 changed files with 1146 additions and 3 deletions
@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from typing import Any
from typing import Optional
from typing import Tuple
from google.genai import types as genai_types
from pydantic import alias_generators
@@ -37,11 +37,11 @@ class IntermediateData(EvalBaseModel):
tool_uses: list[genai_types.FunctionCall] = []
"""Tool use trajectory in chronological order."""
- intermediate_responses: list[Tuple[str, list[genai_types.Part]]] = []
+ intermediate_responses: list[tuple[str, list[genai_types.Part]]] = []
"""Intermediate responses generated by sub-agents to convey progress or status
in a multi-agent system, distinct from the final response.
- This is expressed as a Tuple of:
+ This is expressed as a tuple of:
- Author: Usually the sub-agent name that generated the intermediate
response.
@@ -18,9 +18,11 @@ from enum import Enum
from typing import Optional
from typing import Union
from google.genai import types as genai_types
from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from typing_extensions import TypeAlias
from .eval_case import Invocation
@@ -38,6 +40,24 @@ class PrebuiltMetrics(Enum):
MetricName: TypeAlias = Union[str, PrebuiltMetrics]
class JudgeModelOptions(BaseModel):
"""Options for an eval metric's judge model."""
judge_model: str = Field(
default="gemini-2.5-flash",
description="""The judge model to use for evaluation. It can be a model name.""",
)
judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
default=None, description="""The configuration for the judge model."""
)
num_samples: Optional[int] = Field(
default=None,
description="""The number of times to sample the model for each invocation evaluation.""",
)
class EvalMetric(BaseModel):
"""A metric used to evaluate a particular aspect of an eval case."""
@@ -52,6 +72,11 @@ class EvalMetric(BaseModel):
threshold: float
"""A threshold value. Each metric decides how to interpret this threshold."""
judge_model_options: Optional[JudgeModelOptions] = Field(
default=None,
description="""Options for the judge model.""",
)
class EvalMetricResult(EvalMetric):
"""The actual computed score/value of a particular EvalMetric."""
@@ -0,0 +1,230 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import logging
import re
from typing import Optional
from typing_extensions import override
from ..models.llm_response import LlmResponse
from ..utils.feature_decorator import working_in_progress
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import PerInvocationResult
from .llm_as_judge import LlmAsJudge
from .llm_as_judge_utils import get_eval_status
from .llm_as_judge_utils import get_text_from_content
from .llm_as_judge_utils import Label
logger = logging.getLogger("google_adk." + __name__)
_FINAL_RESPONSE_MATCH_V2_PROMPT = """You are an expert rater for an AI agent. The AI agent is going to call an API to answer the user query and generate API tool use code based for the choice of the API and API arguments. The ideal model response should be a function call that fulfills user query, or a natural language response hedges or asks users for further clarification if a function call does not apply.
The primary focus of this rating task is to check correctness of the model responses.
The data consists of:
- A user query.
- A model-generated response to the prompt. The response can consist of:
  - Natural language, when the model is asking for clarification, or telling the user that it does not possess the requested functionality or option.
  - Code, in the form of one or more Python function calls, plus additional code as needed, when the model is fulfilling the user request.
You can use the help from a reference response annotated by a human rater. This reference response is of high quality. You can compare the agent's response with the reference response and decide if the agent's response is valid.
Note that sometimes the reference response contains only the key entities of the correct answer, so be flexible: allow the agent response to contain more information than the reference response, or to present the key entities in a different format or structure, or in a shorter or longer form.
When the agent response is provided in the form of tables/dataframes, or would best be provided in that form: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response. Likewise, if you have the reference response, find the key entities and main components in it and check whether you can retrieve those from the agent response. If the prompt does not specify any format instructions and the main items/components are included in the response, then tolerate differences in the formatting of those tables/dataframes.
You should follow the constitution below very carefully to rate the model response:
- Allow flexibility of format even when the reference code uses only one of the possible formats, unless the API spec or user prompt has an explicit format requirement.
  - e.g. For state names, allow both abbreviation and full name unless the API spec has an explicit requirement; both 'tx' and 'Texas' should be allowed in the agent response even when the reference code uses only one of them.
  - e.g. If a reference response lists outputs in a list format, the agent response is allowed to use sentence format, and vice versa, unless the user prompt explicitly asks for a specific format.
  - e.g. For numbers, allow flexibility of formatting, e.g. 1000000 vs 1,000,000.
- The model shouldn't assume that it doesn't have access to the relevant data or is incapable of answering the question if the reference response is able to find a legitimate answer.
- If the model response contains the correct final answer, rate it as valid even when the model response contains more information than the reference response.
- If the user prompt has CSV or other tabular data, don't read it yourself. Trust the reference response's final answer instead.
- When the validation needs math or date calculations, do not use your own calculator. Trust the reference response's final answer instead.
- Be mindful about unit of numbers. For example, if the reference response says 100 miles, but the model response says 100 km, it is invalid.
- When the agent response or the reference response is provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response and whether those match the reference response. If the user query does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
- When the answer is in numeric format, check whether there are any format requirements in the numeric format, rounding, precision, number of decimals, etc. specified in the user query and the prompt. If there are no such instructions, then tolerate different numerical formats.
- When the answer is in numeric format and there are rounding or precision differences between the agent response and the reference response, if no further instructions are provided evaluate if the rounding strategy or precision in the agent response follows the standards for that entity. For instance, model accuracy scores must be reported with at least two decimal places (e.g., 0.798 → 0.80 is acceptable, but 0.7 is not).
Below are the inputs:
{{
"User prompt": {prompt},
"Agent response": {response},
"Reference response": {golden_response},
}}
The answer should be JSON alone, following the structure below:
{{
"reasoning": [reasoning],
"is_the_agent_response_valid": [valid or invalid],
}}
Answer with assertiveness:
"""
_DEFAULT_NUM_SAMPLES = 5
def _parse_critique(response: str) -> Label:
"""Parses the judge model critique and extracts the final label.
Args:
response: model response
Returns:
The extracted label, either VALID, INVALID, or NOT_FOUND.
"""
# Regex matching the label field in the response. The end of the field is
# identified by either a comma, new line, or an end-bracket.
label_match_is_response_valid = re.search(
r'"is_the_agent_response_valid":\s*\[*[\n\s]*"*([^"^\]^\s]*)"*[\n\s]*\]*\s*[,\n\}]',
response,
)
# In case the model names the label field as "is_the_agent_response_*invalid*"
# instead of "..._*valid*"
label_match_is_response_invalid = re.search(
r'"is_the_agent_response_invalid":\s*\[*[\n\s]*"*([^"^\]^\s]*)"*[\n\s]*\]*\s*[,\n\}]',
response,
)
# Remove any trailing whitespace, commas, or end-brackets from the label.
if label_match_is_response_valid:
label = label_match_is_response_valid.group(1).strip(" ,}\n")
if label in [
Label.INVALID.value,
Label.ALMOST.value,
Label.FALSE.value,
*Label.PARTIALLY_VALID.value,
]:
label = Label.INVALID
elif label in [Label.VALID.value, Label.TRUE.value]:
label = Label.VALID
else:
label = Label.NOT_FOUND
elif label_match_is_response_invalid:
label = label_match_is_response_invalid.group(1).strip(" ,}\n")
label = (
Label.INVALID
if label in [Label.TRUE.value, Label.INVALID.value]
else Label.VALID
)
else:
label = Label.NOT_FOUND
return label
@working_in_progress
class FinalResponseMatchV2Evaluator(LlmAsJudge):
"""V2 final response match evaluator which uses an LLM to judge responses.
The evaluator prompts the LLM to judge whether the agent's final response is
valid or invalid, so each sample yields a score of 0 or 1. Repeated samples of
the same invocation are aggregated by majority vote, and the overall score is
the fraction of invocations judged valid, ranging from 0 to 1. Higher overall
scores indicate better final response performance of the agent.
"""
def __init__(
self,
eval_metric: EvalMetric,
):
super().__init__(eval_metric)
self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT
assert self._eval_metric.judge_model_options is not None
if self._eval_metric.judge_model_options.num_samples is None:
self._eval_metric.judge_model_options.num_samples = _DEFAULT_NUM_SAMPLES
@override
def format_auto_rater_prompt(
self, actual_invocation: Invocation, expected_invocation: Invocation
) -> str:
reference = get_text_from_content(expected_invocation.final_response)
response = get_text_from_content(actual_invocation.final_response)
user_prompt = get_text_from_content(expected_invocation.user_content)
return self._auto_rater_prompt_template.format(
prompt=user_prompt,
response=response,
golden_response=reference,
)
@override
def convert_auto_rater_response_to_score(
self, llm_response: LlmResponse
) -> Optional[float]:
response_text = get_text_from_content(llm_response.content)
if response_text is None:
return None
label = _parse_critique(response_text)
if label == Label.VALID:
return 1.0
elif label == Label.INVALID:
return 0.0
else:
return None
@override
def aggregate_per_invocation_samples(
self,
per_invocation_samples: list[PerInvocationResult],
) -> PerInvocationResult:
"""Aggregates samples of per-invocation results by taking majority vote.
Only consider results that were successfully evaluated. In the case of a
tie, consider the result to be invalid.
Args:
per_invocation_samples: Samples of per-invocation results to
aggregate.
Returns:
If there is a majority of valid results, return the first valid result.
Otherwise, return the first invalid result. If no results were
successfully evaluated, return the first sample.
"""
positive_results = []
negative_results = []
for result in per_invocation_samples:
if result.score == 1.0:
positive_results.append(result)
elif result.score == 0.0:
negative_results.append(result)
# If no results were successfully evaluated, just return the first sample.
if not positive_results and not negative_results:
return per_invocation_samples[0]
elif len(positive_results) > len(negative_results):
return positive_results[0]
else:
return negative_results[0]
@override
def aggregate_invocation_results(
self, per_invocation_results: list[PerInvocationResult]
) -> EvaluationResult:
"""Computes the fraction of invocation results that are valid."""
num_valid = 0
num_evaluated = 0
for result in per_invocation_results:
if result.score is None or result.eval_status == EvalStatus.NOT_EVALUATED:
continue
num_evaluated += 1
num_valid += result.score
if num_evaluated == 0:
  # Avoid a ZeroDivisionError when no results were successfully evaluated.
  return EvaluationResult(per_invocation_results=per_invocation_results)
overall_score = num_valid / num_evaluated
return EvaluationResult(
overall_score=overall_score,
overall_eval_status=get_eval_status(
overall_score, self._eval_metric.threshold
),
per_invocation_results=per_invocation_results,
)
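To make the two aggregation stages concrete, here is a small self-contained sketch of the same logic (majority vote per invocation, then the fraction of valid invocations), with plain floats standing in for PerInvocationResult objects; it is an illustration, not part of the commit:

from typing import Optional

def majority_vote(samples: list[Optional[float]]) -> Optional[float]:
  """Majority vote over one invocation's samples; ties count as invalid."""
  valid = sum(1 for s in samples if s == 1.0)
  invalid = sum(1 for s in samples if s == 0.0)
  if valid == 0 and invalid == 0:
    return None  # nothing was successfully evaluated
  return 1.0 if valid > invalid else 0.0

def overall_score(per_invocation: list[Optional[float]]) -> Optional[float]:
  """Fraction of evaluated invocations judged valid."""
  evaluated = [s for s in per_invocation if s is not None]
  return sum(evaluated) / len(evaluated) if evaluated else None

# Three of five samples are valid, so the invocation counts as valid.
assert majority_vote([1.0, 1.0, 0.0, 0.0, 1.0]) == 1.0
# Two of four evaluated invocations are valid, so the overall score is 0.5.
assert overall_score([1.0, 0.0, 1.0, 0.0, None]) == 0.5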
@@ -0,0 +1,141 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from abc import abstractmethod
from typing import Optional
from google.genai import types as genai_types
from typing_extensions import override
from ..models.base_llm import BaseLlm
from ..models.llm_request import LlmRequest
from ..models.llm_response import LlmResponse
from ..models.registry import LLMRegistry
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .evaluator import PerInvocationResult
from .llm_as_judge_utils import get_eval_status
class LlmAsJudge(Evaluator):
"""Evaluator based on a LLM.
It is meant to be extended by specific auto-raters for different evaluation
tasks:
- Provide the prompt template, and implement format_auto_rater_prompt to
format the auto-rater prompt for a given invocation.
- Implement convert_auto_rater_response_to_score to parse the auto-rater
response and return the corresponding score.
- Implement aggregate_invocation_results to aggregate the per-invocation
results to get the overall score.
- Implement aggregate_per_invocation_samples to aggregate multiple
auto-rater samples of the same invocation.
"""
def __init__(
self,
eval_metric: EvalMetric,
):
self._eval_metric = eval_metric
if not eval_metric.judge_model_options:
raise ValueError("Judge model options is required for LlmAsJudge.")
self._judge_model_options = eval_metric.judge_model_options
if self._judge_model_options.judge_model_config is None:
self._judge_model_options.judge_model_config = (
genai_types.GenerateContentConfig()
)
self._judge_model = self._setup_auto_rater()
@abstractmethod
def format_auto_rater_prompt(
self, actual: Invocation, expected: Invocation
) -> str:
"""Formats the auto-rater prompt to evaluate the given invocation."""
@abstractmethod
def convert_auto_rater_response_to_score(
self, auto_rater_response: LlmResponse
) -> Optional[float]:
"""Parses auto_rater_response and returns the corresponding score, or None if the score cannot be determined."""
@abstractmethod
def aggregate_per_invocation_samples(
self,
per_invocation_samples: list[PerInvocationResult],
) -> PerInvocationResult:
"""Aggregates repeated per-invocation samples to get the final result for the invocation."""
@abstractmethod
def aggregate_invocation_results(
self,
per_invocation_results: list[PerInvocationResult],
) -> EvaluationResult:
"""Aggregates the per invocation results to get the overall score."""
@override
async def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
) -> EvaluationResult:
per_invocation_results = []
for actual, expected in zip(actual_invocations, expected_invocations):
auto_rater_prompt = self.format_auto_rater_prompt(actual, expected)
llm_request = LlmRequest(
model=self._judge_model_options.judge_model,
contents=[
genai_types.Content(
parts=[genai_types.Part(text=auto_rater_prompt)],
role="user",
)
],
config=self._judge_model_options.judge_model_config,
)
# Fall back to a single sample if num_samples was left unset.
num_samples = self._judge_model_options.num_samples or 1
invocation_result_samples = []
for _ in range(num_samples):
async for llm_response in self._judge_model.generate_content_async(
llm_request
):
# Non-streaming call, so there is only one response content.
score = self.convert_auto_rater_response_to_score(llm_response)
invocation_result_samples.append(
PerInvocationResult(
actual_invocation=actual,
expected_invocation=expected,
score=score,
eval_status=get_eval_status(
score, self._eval_metric.threshold
),
)
)
if not invocation_result_samples:
continue
per_invocation_results.append(
self.aggregate_per_invocation_samples(invocation_result_samples)
)
if per_invocation_results:
return self.aggregate_invocation_results(per_invocation_results)
return EvaluationResult()
def _setup_auto_rater(self) -> BaseLlm:
model_id = self._judge_model_options.judge_model
llm_registry = LLMRegistry()
llm_class = llm_registry.resolve(model_id)
return llm_class(model=model_id)
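A hypothetical end-to-end driver might look like the sketch below. It is not part of this commit: it assumes judge-model credentials are already configured in the environment, and note that FinalResponseMatchV2Evaluator is gated behind @working_in_progress here.

import asyncio

from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator

async def evaluate(actual_invocations, expected_invocations):
  # Both arguments are lists of Invocation, paired positionally by zip().
  evaluator = FinalResponseMatchV2Evaluator(
      EvalMetric(
          metric_name="final_response_match_v2",
          threshold=0.8,
          judge_model_options=JudgeModelOptions(judge_model="gemini-2.5-flash"),
      )
  )
  return await evaluator.evaluate_invocations(
      actual_invocations, expected_invocations
  )

# result = asyncio.run(evaluate(actual_invocations, expected_invocations))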
@@ -0,0 +1,48 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import enum
from typing import Optional
from google.genai import types as genai_types
from .evaluator import EvalStatus
@enum.unique
class Label(enum.Enum):
"""Labels for auto rater response."""
TRUE = "true"
INVALID = "invalid"
VALID = "valid"
PARTIALLY_VALID = "partially_valid", "partially valid", "partially"
ALMOST = "almost"
FALSE = "false"
NOT_FOUND = "label field not found"
def get_text_from_content(
content: Optional[genai_types.Content],
) -> Optional[str]:
if content and content.parts:
return "\n".join([p.text for p in content.parts if p.text])
def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus:
if score is None:
return EvalStatus.NOT_EVALUATED
return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
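One subtlety in the enum above: assigning a comma-separated sequence gives PARTIALLY_VALID a tuple value, which is why FinalResponseMatchV2Evaluator unpacks it (*Label.PARTIALLY_VALID.value) when building its list of invalid labels. A quick sketch, not part of the commit, demonstrating this along with the threshold helper:

from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.llm_as_judge_utils import get_eval_status
from google.adk.evaluation.llm_as_judge_utils import Label

assert Label.VALID.value == "valid"
assert Label.PARTIALLY_VALID.value == ("partially_valid", "partially valid", "partially")
assert get_eval_status(score=0.9, threshold=0.8) == EvalStatus.PASSED
assert get_eval_status(score=0.5, threshold=0.8) == EvalStatus.FAILED
assert get_eval_status(score=None, threshold=0.8) == EvalStatus.NOT_EVALUATED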
@@ -0,0 +1,478 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.evaluator import PerInvocationResult
from google.adk.evaluation.final_response_match_v2 import _parse_critique
from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
from google.adk.evaluation.llm_as_judge_utils import Label
from google.adk.models.llm_response import LlmResponse
from google.genai import types as genai_types
import pytest
@pytest.mark.parametrize(
"response_text",
[
"""```json
{
"is_the_agent_response_valid_or_invalid": "valid",
"reasoning": "The response is valid."
}
```""",
"""```json
{
"is_the_agent_response_valid": "undefined label",
}
```""",
],
)
def test_parse_critique_label_not_found(response_text):
label = _parse_critique(response_text)
assert label == Label.NOT_FOUND
@pytest.mark.parametrize(
"response_text",
[
"""```json
{
"is_the_agent_response_valid": "valid",
"reasoning": "The response is valid."
}
```""",
"""```json
{
"is_the_agent_response_valid": ["valid"],
"reasoning": "The response is valid."
}
```""",
"""```json
{
"is_the_agent_response_valid":\n [ "valid\n"],
"reasoning": "The response is valid."
}
```""",
],
)
def test_parse_critique(response_text):
label = _parse_critique(response_text)
assert label == Label.VALID
@pytest.mark.parametrize(
"response_text",
[
"""```json
{
"is_the_agent_response_invalid": "invalid",
"reasoning": "The response is invalid."
}
```""",
"""```json
{
"is_the_agent_response_invalid": ["invalid"],
"reasoning": "The response is invalid."
}
```""",
"""```json
{
"is_the_agent_response_invalid":\n [ "invalid\n"],
"reasoning": "The response is invalid."
}
```""",
],
)
def test_parse_critique_invalid(response_text):
label = _parse_critique(response_text)
assert label == Label.INVALID
def create_test_template() -> str:
return """
This is a test template.
{{
"User prompt": {prompt},
"Agent response": {response},
"Reference response": {golden_response},
}}
The answer should be a json alone which follows the json structure below:
{{
"is_the_agent_response_valid": [valid or invalid],
"reasoning":
}}
"""
def _create_test_evaluator_gemini(
threshold: float,
) -> FinalResponseMatchV2Evaluator:
evaluator = FinalResponseMatchV2Evaluator(
EvalMetric(
metric_name="final_response_match_v2",
threshold=threshold,
judge_model_options=JudgeModelOptions(
judge_model="gemini-2.5-flash",
num_samples=3,
),
),
)
evaluator._auto_rater_prompt_template = create_test_template()
return evaluator
def _create_test_invocations(
candidate: str, reference: str
) -> tuple[Invocation, Invocation]:
"""Returns tuple of (actual_invocation, expected_invocation)."""
actual_invocation = Invocation(
user_content=genai_types.Content(
parts=[genai_types.Part(text="This is a test query.")],
role="user",
),
final_response=genai_types.Content(
parts=[genai_types.Part(text=candidate)],
role="model",
),
)
expected_invocation = Invocation(
user_content=genai_types.Content(
parts=[genai_types.Part(text="This is a test query.")],
role="user",
),
final_response=genai_types.Content(
parts=[genai_types.Part(text=reference)],
role="model",
),
)
return actual_invocation, expected_invocation
def test_format_auto_rater_prompt():
evaluator = _create_test_evaluator_gemini(threshold=0.8)
actual_invocation, expected_invocation = _create_test_invocations(
"candidate text", "reference text"
)
prompt = evaluator.format_auto_rater_prompt(
actual_invocation, expected_invocation
)
assert prompt == """
This is a test template.
{
"User prompt": This is a test query.,
"Agent response": candidate text,
"Reference response": reference text,
}
The answer should be a json alone which follows the json structure below:
{
"is_the_agent_response_valid": [valid or invalid],
"reasoning":
}
"""
def test_convert_auto_rater_response_to_score_valid():
evaluator = _create_test_evaluator_gemini(threshold=0.8)
auto_rater_response = """```json
{
"is_the_agent_response_valid": "valid",
"reasoning": "The response is valid."
}
```"""
llm_response = LlmResponse(
content=genai_types.Content(
parts=[genai_types.Part(text=auto_rater_response)],
role="model",
)
)
score = evaluator.convert_auto_rater_response_to_score(llm_response)
assert score == 1.0
def test_convert_auto_rater_response_to_score_invalid():
evaluator = _create_test_evaluator_gemini(threshold=0.8)
auto_rater_response = """```json
{
"is_the_agent_response_valid": "invalid",
"reasoning": "The response is invalid."
}
```"""
llm_response = LlmResponse(
content=genai_types.Content(
parts=[genai_types.Part(text=auto_rater_response)],
role="model",
)
)
score = evaluator.convert_auto_rater_response_to_score(llm_response)
assert score == 0.0
def test_convert_auto_rater_response_to_score_invalid_json():
evaluator = _create_test_evaluator_gemini(threshold=0.8)
llm_response = LlmResponse(
content=genai_types.Content(
parts=[genai_types.Part(text="invalid json")],
role="model",
)
)
score = evaluator.convert_auto_rater_response_to_score(llm_response)
assert score is None
def test_convert_auto_rater_response_to_score_missing_key():
evaluator = _create_test_evaluator_gemini(threshold=0.8)
llm_response = LlmResponse(
content=genai_types.Content(
parts=[genai_types.Part(text="{}")],
role="model",
)
)
score = evaluator.convert_auto_rater_response_to_score(llm_response)
assert score is None
def test_aggregate_per_invocation_samples_none_evaluated():
evaluator = _create_test_evaluator_gemini(threshold=0.5)
actual_invocation, expected_invocation = _create_test_invocations(
"candidate text", "reference text"
)
per_invocation_result_samples = [
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=None,
eval_status=EvalStatus.NOT_EVALUATED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=None,
eval_status=EvalStatus.NOT_EVALUATED,
),
]
assert (
evaluator.aggregate_per_invocation_samples(per_invocation_result_samples)
== per_invocation_result_samples[0]
)
def test_aggregate_per_invocation_samples_valid():
evaluator = _create_test_evaluator_gemini(threshold=0.5)
actual_invocation, expected_invocation = _create_test_invocations(
"candidate text", "reference text"
)
per_invocation_result_samples = [
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.PASSED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.PASSED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=0.0,
eval_status=EvalStatus.FAILED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=0.0,
eval_status=EvalStatus.FAILED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.PASSED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.NOT_EVALUATED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=None,
eval_status=EvalStatus.NOT_EVALUATED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=0.0,
eval_status=EvalStatus.NOT_EVALUATED,
),
]
per_invocation_result = evaluator.aggregate_per_invocation_samples(
per_invocation_result_samples
)
assert per_invocation_result.score == 1.0
assert per_invocation_result.eval_status == EvalStatus.PASSED
def test_aggregate_per_invocation_samples_invalid():
evaluator = _create_test_evaluator_gemini(threshold=0.5)
actual_invocation, expected_invocation = _create_test_invocations(
"candidate text", "reference text"
)
per_invocation_result_samples = [
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=0.0,
eval_status=EvalStatus.FAILED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.PASSED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=0.0,
eval_status=EvalStatus.FAILED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=0.0,
eval_status=EvalStatus.FAILED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.PASSED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.PASSED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.NOT_EVALUATED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=None,
eval_status=EvalStatus.NOT_EVALUATED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=0.0,
eval_status=EvalStatus.NOT_EVALUATED,
),
]
per_invocation_result = evaluator.aggregate_per_invocation_samples(
per_invocation_result_samples
)
assert per_invocation_result.score == 0.0
assert per_invocation_result.eval_status == EvalStatus.FAILED
def test_aggregate_invocation_results():
evaluator = _create_test_evaluator_gemini(threshold=0.5)
actual_invocation, expected_invocation = _create_test_invocations(
"candidate text", "reference text"
)
per_invocation_results = [
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.PASSED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=1.0,
eval_status=EvalStatus.PASSED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=0.0,
eval_status=EvalStatus.FAILED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=0.0,
eval_status=EvalStatus.FAILED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=None,
eval_status=EvalStatus.PASSED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=100.0,
eval_status=EvalStatus.NOT_EVALUATED,
),
PerInvocationResult(
actual_invocation=actual_invocation,
expected_invocation=expected_invocation,
score=None,
eval_status=EvalStatus.NOT_EVALUATED,
),
]
aggregated_result = evaluator.aggregate_invocation_results(
per_invocation_results
)
# Only 4 of the 7 results are evaluated, and 2 of those 4 are valid.
assert aggregated_result.overall_score == 0.5
assert aggregated_result.overall_eval_status == EvalStatus.PASSED
@@ -0,0 +1,221 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from typing import Optional
from unittest.mock import MagicMock
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import PerInvocationResult
from google.adk.evaluation.llm_as_judge import LlmAsJudge
from google.adk.evaluation.llm_as_judge_utils import get_eval_status
from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
from google.adk.models.llm_response import LlmResponse
from google.genai import types as genai_types
import pytest
class MockLlmAsJudge(LlmAsJudge):
def format_auto_rater_prompt(
self, actual_invocation: Invocation, expected_invocation: Invocation
) -> str:
return "formatted prompt"
def convert_auto_rater_response_to_score(
self, llm_response: LlmResponse
) -> Optional[float]:
return 1.0
def aggregate_per_invocation_samples(
self,
per_invocation_samples: list[PerInvocationResult],
) -> PerInvocationResult:
return per_invocation_samples[0]
def aggregate_invocation_results(
self, per_invocation_results: list[PerInvocationResult]
) -> EvaluationResult:
return EvaluationResult(
overall_score=1.0, overall_eval_status=EvalStatus.PASSED
)
@pytest.fixture
def mock_llm_as_judge():
return MockLlmAsJudge(
EvalMetric(
metric_name="test_metric",
threshold=0.5,
judge_model_options=JudgeModelOptions(
judge_model="gemini-2.5-flash",
judge_model_config=genai_types.GenerateContentConfig(),
num_samples=3,
),
),
)
def test_get_text_from_content():
content = genai_types.Content(
parts=[
genai_types.Part(text="This is a test text."),
genai_types.Part(text="This is another test text."),
],
role="model",
)
assert (
get_text_from_content(content)
== "This is a test text.\nThis is another test text."
)
def test_get_eval_status():
assert get_eval_status(score=0.8, threshold=0.8) == EvalStatus.PASSED
assert get_eval_status(score=0.7, threshold=0.8) == EvalStatus.FAILED
assert get_eval_status(score=0.8, threshold=0.9) == EvalStatus.FAILED
assert get_eval_status(score=0.9, threshold=0.8) == EvalStatus.PASSED
assert get_eval_status(score=None, threshold=0.8) == EvalStatus.NOT_EVALUATED
def test_llm_as_judge_init_missing_judge_model_options():
with pytest.raises(ValueError):
MockLlmAsJudge(
EvalMetric(metric_name="test_metric", threshold=0.8),
)
def test_llm_as_judge_init_unregistered_model():
with pytest.raises(ValueError):
MockLlmAsJudge(
EvalMetric(
metric_name="test_metric",
threshold=0.8,
judge_model_options=JudgeModelOptions(
judge_model="unregistered_model",
),
),
)
@pytest.fixture
def mock_judge_model():
mock_judge_model = MagicMock()
async def mock_generate_content_async(llm_request):
yield LlmResponse(
content=genai_types.Content(
parts=[genai_types.Part(text="auto rater response")],
)
)
mock_judge_model.generate_content_async = mock_generate_content_async
return mock_judge_model
@pytest.mark.asyncio
async def test_evaluate_invocations_with_mock(
mock_llm_as_judge, mock_judge_model
):
mock_llm_as_judge._judge_model = mock_judge_model
mock_format_auto_rater_prompt = MagicMock(
wraps=mock_llm_as_judge.format_auto_rater_prompt
)
mock_llm_as_judge.format_auto_rater_prompt = mock_format_auto_rater_prompt
mock_convert_auto_rater_response_to_score = MagicMock(
wraps=mock_llm_as_judge.convert_auto_rater_response_to_score
)
mock_llm_as_judge.convert_auto_rater_response_to_score = (
mock_convert_auto_rater_response_to_score
)
mock_aggregate_per_invocation_samples = MagicMock(
wraps=mock_llm_as_judge.aggregate_per_invocation_samples
)
mock_llm_as_judge.aggregate_per_invocation_samples = (
mock_aggregate_per_invocation_samples
)
mock_aggregate_invocation_results = MagicMock(
wraps=mock_llm_as_judge.aggregate_invocation_results
)
mock_llm_as_judge.aggregate_invocation_results = (
mock_aggregate_invocation_results
)
actual_invocations = [
Invocation(
invocation_id="id1",
user_content=genai_types.Content(
parts=[genai_types.Part(text="user content 1")],
role="user",
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="final response 1")],
role="model",
),
),
Invocation(
invocation_id="id2",
user_content=genai_types.Content(
parts=[genai_types.Part(text="user content 2")],
role="user",
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="final response 2")],
role="model",
),
),
]
expected_invocations = [
Invocation(
invocation_id="id1",
user_content=genai_types.Content(
parts=[genai_types.Part(text="user content 1")],
role="user",
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="expected response 1")],
role="model",
),
),
Invocation(
invocation_id="id2",
user_content=genai_types.Content(
parts=[genai_types.Part(text="user content 2")],
role="user",
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="expected response 2")],
role="model",
),
),
]
result = await mock_llm_as_judge.evaluate_invocations(
actual_invocations, expected_invocations
)
# Assertions
assert result.overall_score == 1.0
assert mock_llm_as_judge.format_auto_rater_prompt.call_count == 2
assert mock_llm_as_judge.convert_auto_rater_response_to_score.call_count == 6
assert mock_llm_as_judge.aggregate_invocation_results.call_count == 1