feat: Adds Rubric based final response evaluator
The evaluator uses a set of rubrics to assess the quality of the agent's final response.

PiperOrigin-RevId: 811154498

committed by Copybara-Service
parent 01923a9227
commit 5a485b01cd
@@ -47,3 +47,17 @@ class AppDetails(EvalBaseModel):
       default_factory=dict,
   )
   """A mapping from the agent name to the details of that agent."""
+
+  def get_developer_instructions(self, agent_name: str) -> str:
+    """Returns a string containing the developer instructions."""
+    if agent_name not in self.agent_details:
+      raise ValueError(f"`{agent_name}` not found in the agentic system.")
+
+    return self.agent_details[agent_name].instructions
+
+  def get_tools_by_agent_name(self) -> dict[str, genai_types.ToolListUnion]:
+    """Returns a dictionary of tools available to each agent in the App, keyed by agent name."""
+    return {
+        name: details.tool_declarations
+        for name, details in self.agent_details.items()
+    }
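Illustrative sketch (not part of the commit) of the two accessors added above; it assumes the AgentDetails fields (name, instructions, tool_declarations) exercised by the new unit tests later in this diff:

from google.adk.evaluation.app_details import AgentDetails
from google.adk.evaluation.app_details import AppDetails

app_details = AppDetails(
    agent_details={
        'helper': AgentDetails(
            name='helper', instructions='be helpful', tool_declarations=[]
        ),
    }
)

assert app_details.get_developer_instructions('helper') == 'be helpful'
assert app_details.get_tools_by_agent_name() == {'helper': []}

# Unknown agent names raise, per the guard clause in the hunk above.
try:
  app_details.get_developer_instructions('unknown')
except ValueError:
  pass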
@@ -168,3 +168,54 @@ def get_all_tool_calls(
   )
 
   return tool_calls
+
+
+def get_all_tool_responses(
+    intermediate_data: Optional[IntermediateDataType],
+) -> list[genai_types.FunctionResponse]:
+  """A utility method to retrieve tool responses from intermediate data."""
+  if not intermediate_data:
+    return []
+
+  tool_responses = []
+  if isinstance(intermediate_data, IntermediateData):
+    tool_responses = intermediate_data.tool_responses
+  elif isinstance(intermediate_data, InvocationEvents):
+    # Go over each event in the list of events.
+    for invocation_event in intermediate_data.invocation_events:
+      # Check if the event has content and some parts.
+      if invocation_event.content and invocation_event.content.parts:
+        for p in invocation_event.content.parts:
+          # For each part, check if it is a function response.
+          if p.function_response:
+            tool_responses.append(p.function_response)
+  else:
+    raise ValueError(
+        f"Unsupported type for intermediate_data `{intermediate_data}`"
+    )
+
+  return tool_responses
+
+
+ToolCallAndResponse: TypeAlias = tuple[
+    genai_types.FunctionCall, Optional[genai_types.FunctionResponse]
+]
+"""A tuple representing a function call and the corresponding optional function response."""
+
+
+def get_all_tool_calls_with_responses(
+    intermediate_data: Optional[IntermediateDataType],
+) -> list[ToolCallAndResponse]:
+  """Returns tool calls with the corresponding responses, if available."""
+  tool_responses_by_call_id: dict[str, genai_types.FunctionResponse] = {
+      tool_response.id: tool_response
+      for tool_response in get_all_tool_responses(intermediate_data)
+  }
+
+  tool_call_and_responses: list[ToolCallAndResponse] = []
+
+  for tool_call in get_all_tool_calls(intermediate_data):
+    response = tool_responses_by_call_id.get(tool_call.id, None)
+    tool_call_and_responses.append((tool_call, response))
+
+  return tool_call_and_responses
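Illustrative sketch (not part of the commit) of the id-based pairing implemented above; the IntermediateData construction mirrors the new tests later in this diff:

from google.adk.evaluation.eval_case import get_all_tool_calls_with_responses
from google.adk.evaluation.eval_case import IntermediateData
from google.genai import types as genai_types

call = genai_types.FunctionCall(name='search', args={'query': 'weather'}, id='c1')
response = genai_types.FunctionResponse(
    name='search', response={'result': 'sunny'}, id='c1'
)
orphan_call = genai_types.FunctionCall(name='lookup', args={'id': '123'}, id='c2')

data = IntermediateData(tool_uses=[call, orphan_call], tool_responses=[response])

# A call whose id has no matching response is paired with None.
assert get_all_tool_calls_with_responses(data) == [
    (call, response),
    (orphan_call, None),
]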
@@ -48,6 +48,10 @@ class PrebuiltMetrics(Enum):
 
   FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
 
+  RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1 = (
+      "rubric_based_final_response_quality_v1"
+  )
+
 
 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
 Threshold: TypeAlias = float
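Illustrative sketch (not part of the commit) of referencing the new prebuilt metric when configuring an EvalMetric; the metric_name and threshold fields are inferred from their use elsewhere in this diff:

from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import PrebuiltMetrics

# MetricName is Union[str, PrebuiltMetrics], so the enum value (a plain string)
# identifies the metric; get_eval_status treats score >= threshold as PASSED.
metric = EvalMetric(
    metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
    threshold=0.5,
)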
@@ -23,6 +23,7 @@ from typing_extensions import TypeAlias
 from .eval_case import Invocation
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalStatus
+from .eval_rubrics import RubricScore
 
 # Redefining the type here for backward compatibility.
 EvalStatus: TypeAlias = EvalStatus
@@ -35,6 +36,7 @@ class PerInvocationResult(BaseModel):
   expected_invocation: Invocation
   score: Optional[float] = None
   eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
+  rubric_scores: Optional[list[RubricScore]] = None
 
 
 class EvaluationResult(BaseModel):
@@ -45,6 +47,10 @@ class EvaluationResult(BaseModel):
   """Overall status, based on each invocation."""
 
   per_invocation_results: list[PerInvocationResult] = []
   """Detailed results per invocation."""
 
+  overall_rubric_scores: Optional[list[RubricScore]] = None
+  """Overall rubric scores, based on each invocation."""
+
 
 class Evaluator(ABC):
@@ -33,6 +33,7 @@ from .eval_metrics import MetricValueInfo
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvaluationResult
 from .evaluator import PerInvocationResult
+from .llm_as_judge import AutoRaterScore
 from .llm_as_judge import LlmAsJudge
 from .llm_as_judge_utils import get_eval_status
 from .llm_as_judge_utils import get_text_from_content
@@ -179,17 +180,17 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
   @override
   def convert_auto_rater_response_to_score(
       self, llm_response: LlmResponse
-  ) -> Optional[float]:
+  ) -> AutoRaterScore:
     response_text = get_text_from_content(llm_response.content)
     if response_text is None:
-      return None
+      return AutoRaterScore()
     label = _parse_critique(response_text)
     if label == Label.VALID:
-      return 1.0
+      return AutoRaterScore(score=1.0)
     elif label == Label.INVALID:
-      return 0.0
+      return AutoRaterScore(score=0.0)
     else:
-      return None
+      return AutoRaterScore()
 
   @override
   def aggregate_per_invocation_samples(
@@ -26,15 +26,22 @@ from ..models.llm_request import LlmRequest
 from ..models.llm_response import LlmResponse
 from ..models.registry import LLMRegistry
 from ..utils.context_utils import Aclosing
+from .common import EvalBaseModel
 from .eval_case import Invocation
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
+from .eval_metrics import RubricScore
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
 from .llm_as_judge_utils import get_eval_status
 
 
+class AutoRaterScore(EvalBaseModel):
+  score: Optional[float] = None
+  rubric_scores: Optional[list[RubricScore]] = None
+
+
 class LlmAsJudge(Evaluator):
   """Evaluator based on an LLM.
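Illustrative sketch (not part of the commit) of the new return contract: judges now produce a structured AutoRaterScore rather than a bare Optional[float] (compare MockLlmAsJudge in the test changes below). The rubric_id is a made-up placeholder:

from google.adk.evaluation.eval_rubrics import RubricScore
from google.adk.evaluation.llm_as_judge import AutoRaterScore

# An unparsable auto-rater response maps to an empty AutoRaterScore...
empty = AutoRaterScore()
assert empty.score is None and empty.rubric_scores is None

# ...while a parsed response can carry an overall score plus per-rubric scores.
scored = AutoRaterScore(
    score=1.0,
    rubric_scores=[RubricScore(rubric_id='r1', score=1.0)],  # 'r1' is illustrative
)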
@@ -82,7 +89,7 @@ class LlmAsJudge(Evaluator):
   @abstractmethod
   def convert_auto_rater_response_to_score(
       self, auto_rater_response: LlmResponse
-  ) -> Optional[float]:
+  ) -> AutoRaterScore:
     """Parses auto_rater_response and returns the corresponding score, or None if the score cannot be determined."""
 
   @abstractmethod
@@ -126,15 +133,18 @@ class LlmAsJudge(Evaluator):
     ) as agen:
       async for llm_response in agen:
         # Non-streaming call, so there is only one response content.
-        score = self.convert_auto_rater_response_to_score(llm_response)
+        auto_rater_score = self.convert_auto_rater_response_to_score(
+            llm_response
+        )
         invocation_result_samples.append(
             PerInvocationResult(
                 actual_invocation=actual,
                 expected_invocation=expected,
-                score=score,
+                score=auto_rater_score.score,
                 eval_status=get_eval_status(
-                    score, self._criterion.threshold
+                    auto_rater_score.score, self._eval_metric.threshold
                 ),
+                rubric_scores=auto_rater_score.rubric_scores,
             )
         )
     if not invocation_result_samples:
@@ -15,10 +15,17 @@
 from __future__ import annotations
 
 import enum
+import statistics
 from typing import Optional
+from typing import Union
 
 from google.genai import types as genai_types
 
+from .app_details import AppDetails
+from .common import EvalBaseModel
+from .eval_case import get_all_tool_calls_with_responses
+from .eval_case import IntermediateDataType
+from .eval_metrics import RubricScore
 from .evaluator import EvalStatus
@@ -46,3 +53,97 @@ def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus:
   if score is None:
     return EvalStatus.NOT_EVALUATED
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
+
+
+def get_average_rubric_score(
+    rubric_scores: list[RubricScore],
+) -> Optional[float]:
+  """Returns a single score value from the given list of rubric scores.
+
+  It is possible that none of the rubric scores actually contains a score
+  value; if that happens, then None is returned.
+
+  If score values are present, then their mean is returned as the aggregated
+  value.
+  """
+  rubric_scores = [
+      rubric_score.score
+      for rubric_score in rubric_scores
+      if rubric_score.score is not None
+  ]
+
+  return statistics.mean(rubric_scores) if rubric_scores else None
+
+
+class _ToolDeclarations(EvalBaseModel):
+  """Internal data model used for serializing tool declarations."""
+
+  tool_declarations: dict[str, genai_types.ToolListUnion]
+
+
+def get_tool_declarations_as_json_str(
+    app_details: AppDetails,
+) -> str:
+  """Returns a JSON string representation of tool declarations.
+
+  The output of this method is usually intended to be sent to the LLM.
+  """
+  tool_declarations = _ToolDeclarations(
+      tool_declarations=app_details.get_tools_by_agent_name()
+  )
+  return tool_declarations.model_dump_json(
+      indent=2,
+      exclude_unset=True,
+      exclude_defaults=True,
+      exclude_none=True,
+  )
+
+
+class _ToolCallAndResponse(EvalBaseModel):
+  """Internal data model to capture one single tool call and response."""
+
+  step: int
+  tool_call: genai_types.FunctionCall
+  tool_response: Union[genai_types.FunctionResponse, str]
+
+
+class _ToolCallsAndResponses(EvalBaseModel):
+  """Internal data model used for serializing tool calls and responses."""
+
+  tool_calls_and_response: list[_ToolCallAndResponse]
+
+
+def get_tool_calls_and_responses_as_json_str(
+    intermediate_data: Optional[IntermediateDataType],
+) -> str:
+  """Returns a JSON string representation of tool calls and corresponding responses.
+
+  The output of this method is usually intended to be sent to the LLM.
+  """
+  raw_tool_calls_and_response = get_all_tool_calls_with_responses(
+      intermediate_data
+  )
+
+  if not raw_tool_calls_and_response:
+    return "No intermediate steps were taken."
+
+  tool_calls_and_responses = []
+  for idx, (tool_call, tool_response) in enumerate(raw_tool_calls_and_response):
+    tool_calls_and_responses.append(
+        _ToolCallAndResponse(
+            step=idx,
+            tool_call=tool_call,
+            tool_response=tool_response if tool_response else "None",
+        )
+    )
+
+  internal_tool_calls_and_responses = _ToolCallsAndResponses(
+      tool_calls_and_response=tool_calls_and_responses
+  )
+
+  return internal_tool_calls_and_responses.model_dump_json(
+      indent=2,
+      exclude_unset=True,
+      exclude_defaults=True,
+      exclude_none=True,
+  )
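Illustrative sketch (not part of the commit) of the aggregation rule above: None scores are dropped and the mean of the remainder is returned, or None if nothing remains. The values mirror the new unit tests at the end of this diff:

from google.adk.evaluation.eval_rubrics import RubricScore
from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score

scores = [
    RubricScore(rubric_id='1', score=0.8),
    RubricScore(rubric_id='2', score=None),  # ignored by the aggregation
    RubricScore(rubric_id='3', score=0.6),
]
assert get_average_rubric_score(scores) == 0.7
assert get_average_rubric_score([]) is None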
@@ -40,6 +40,7 @@ from .base_eval_service import InferenceStatus
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultDetails
 from .eval_metrics import EvalMetricResultPerInvocation
 from .eval_result import EvalCaseResult
 from .eval_set import EvalCase
@@ -239,12 +240,15 @@ class LocalEvalService(BaseEvalService):
       )
 
       # Track overall score across all invocations.
+      eval_metric_result_details = EvalMetricResultDetails(
+          rubric_scores=evaluation_result.overall_rubric_scores
+      )
       overall_eval_metric_results.append(
           EvalMetricResult(
-              metric_name=eval_metric.metric_name,
-              threshold=eval_metric.threshold,
               score=evaluation_result.overall_score,
               eval_status=evaluation_result.overall_eval_status,
+              details=eval_metric_result_details,
+              **eval_metric.model_dump(),
           )
       )
@@ -262,12 +266,15 @@ class LocalEvalService(BaseEvalService):
         evaluation_result.per_invocation_results,
         eval_metric_result_per_invocation,
     ):
+      eval_metric_result_details = EvalMetricResultDetails(
+          rubric_scores=invocation_result.rubric_scores
+      )
       invocation.eval_metric_results.append(
           EvalMetricResult(
-              metric_name=eval_metric.metric_name,
-              threshold=eval_metric.threshold,
              score=invocation_result.score,
              eval_status=invocation_result.eval_status,
+              details=eval_metric_result_details,
+              **eval_metric.model_dump(),
          )
      )
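Illustrative sketch (not part of the commit) of the new plumbing: rubric scores ride along on the metric result via EvalMetricResultDetails. The rubric_id is a made-up placeholder:

from google.adk.evaluation.eval_metrics import EvalMetricResultDetails
from google.adk.evaluation.eval_rubrics import RubricScore

details = EvalMetricResultDetails(
    rubric_scores=[RubricScore(rubric_id='clarity', score=1.0)]
)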
@@ -24,6 +24,7 @@ from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
 from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
 from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator
@@ -111,6 +112,10 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
       metric_info=FinalResponseMatchV2Evaluator.get_metric_info(),
       evaluator=FinalResponseMatchV2Evaluator,
   )
+  metric_evaluator_registry.register_evaluator(
+      metric_info=RubricBasedFinalResponseQualityV1Evaluator.get_metric_info(),
+      evaluator=RubricBasedFinalResponseQualityV1Evaluator,
+  )
 
   return metric_evaluator_registry
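Illustrative sketch (not part of the commit): building the default registry now also registers the rubric-based evaluator. The module path google.adk.evaluation.metric_evaluator_registry is an assumption; only the function name appears in this diff, and it is module-internal:

from google.adk.evaluation.metric_evaluator_registry import (
    _get_default_metric_evaluator_registry,  # assumed import path
)

registry = _get_default_metric_evaluator_registry()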
File diff suppressed because it is too large.
@@ -0,0 +1,73 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from google.adk.evaluation.app_details import AgentDetails
+from google.adk.evaluation.app_details import AppDetails
+from google.genai import types as genai_types
+from pytest import raises
+
+
+def test_get_developer_instructions_existing_agent():
+  agent_details = {
+      'agent1': AgentDetails(
+          name='agent1', instructions='instruction for agent1'
+      ),
+      'agent2': AgentDetails(
+          name='agent2', instructions='instruction for agent2'
+      ),
+  }
+  app_details = AppDetails(
+      agent_details=agent_details,
+  )
+
+  # Test for an existing agent.
+  instructions = app_details.get_developer_instructions('agent1')
+  assert instructions == 'instruction for agent1'
+
+
+def test_get_developer_instructions_non_existing_Agent():
+  agent_details = {
+      'agent1': AgentDetails(
+          name='agent1', instructions='instruction for agent1'
+      ),
+      'agent2': AgentDetails(
+          name='agent2', instructions='instruction for agent2'
+      ),
+  }
+  app_details = AppDetails(
+      agent_details=agent_details,
+  )
+
+  # Test for a non-existing agent.
+  with raises(ValueError, match='`agent3` not found in the agentic system.'):
+    app_details.get_developer_instructions('agent3')
+
+
+def test_get_tools_by_agent_name():
+  tool1 = genai_types.Tool(
+      function_declarations=[genai_types.FunctionDeclaration(name='tool1_func')]
+  )
+  agent_details = {
+      'agent1': AgentDetails(name='agent1', tool_declarations=[tool1]),
+      'agent2': AgentDetails(name='agent2', tool_declarations=[]),
+  }
+  app_details = AppDetails(
+      agent_details=agent_details,
+  )
+
+  tools = app_details.get_tools_by_agent_name()
+  expected_tools = {'agent1': [tool1], 'agent2': []}
+  assert tools == expected_tools
@@ -15,6 +15,8 @@
 from __future__ import annotations
 
 from google.adk.evaluation.eval_case import get_all_tool_calls
+from google.adk.evaluation.eval_case import get_all_tool_calls_with_responses
+from google.adk.evaluation.eval_case import get_all_tool_responses
 from google.adk.evaluation.eval_case import IntermediateData
 from google.adk.evaluation.eval_case import InvocationEvent
 from google.adk.evaluation.eval_case import InvocationEvents
@@ -97,3 +99,150 @@ def test_get_all_tool_calls_with_unsupported_type():
       ValueError, match='Unsupported type for intermediate_data'
   ):
     get_all_tool_calls('this is not a valid type')
+
+
+def test_get_all_tool_responses_with_none_input():
+  """Tests that an empty list is returned when intermediate_data is None."""
+  assert get_all_tool_responses(None) == []
+
+
+def test_get_all_tool_responses_with_empty_invocation_events():
+  """Tests InvocationEvents with an empty list of events."""
+  intermediate_data = InvocationEvents(invocation_events=[])
+  assert get_all_tool_responses(intermediate_data) == []
+
+
+def test_get_all_tool_responses_with_invocation_events_no_tools():
+  """Tests InvocationEvents containing events without any tool responses."""
+  invocation_event = InvocationEvent(
+      author='agent',
+      content=genai_types.Content(
+          parts=[genai_types.Part(text='Thinking...')], role='model'
+      ),
+  )
+  intermediate_data = InvocationEvents(invocation_events=[invocation_event])
+  assert get_all_tool_responses(intermediate_data) == []
+
+
+def test_get_all_tool_responses_with_invocation_events():
+  """Tests that tool responses are correctly extracted from an InvocationEvents object."""
+  tool_response1 = genai_types.FunctionResponse(
+      name='search', response={'result': 'weather is good'}
+  )
+  tool_response2 = genai_types.FunctionResponse(
+      name='lookup', response={'id': '123'}
+  )
+  invocation_event1 = InvocationEvent(
+      author='agent1',
+      content=genai_types.Content(
+          parts=[genai_types.Part(function_response=tool_response1)],
+          role='model',
+      ),
+  )
+  invocation_event2 = InvocationEvent(
+      author='agent2',
+      content=genai_types.Content(
+          parts=[
+              genai_types.Part(text='Found something.'),
+              genai_types.Part(function_response=tool_response2),
+          ],
+          role='model',
+      ),
+  )
+  intermediate_data = InvocationEvents(
+      invocation_events=[invocation_event1, invocation_event2]
+  )
+  assert get_all_tool_responses(intermediate_data) == [
+      tool_response1,
+      tool_response2,
+  ]
+
+
+def test_get_all_tool_responses_with_unsupported_type():
+  """Tests that a ValueError is raised for unsupported intermediate_data types."""
+  with pytest.raises(
+      ValueError, match='Unsupported type for intermediate_data'
+  ):
+    get_all_tool_responses('this is not a valid type')
+
+
+def test_get_all_tool_calls_with_responses_with_none_input():
+  """Tests that an empty list is returned when intermediate_data is None."""
+  assert get_all_tool_calls_with_responses(None) == []
+
+
+def test_get_all_tool_calls_with_responses_with_intermediate_data_no_tool_calls():
+  """Tests get_all_tool_calls_with_responses with IntermediateData with no tool calls."""
+  # No tool calls.
+  intermediate_data = IntermediateData(tool_uses=[], tool_responses=[])
+  assert get_all_tool_calls_with_responses(intermediate_data) == []
+
+
+def test_get_all_tool_calls_with_responses_with_intermediate_data_with_tool_calls():
+  """Tests get_all_tool_calls_with_responses with IntermediateData with tools."""
+  # With matching and non-matching tool calls.
+  tool_call1 = genai_types.FunctionCall(
+      name='search', args={'query': 'weather'}, id='call1'
+  )
+  tool_response1 = genai_types.FunctionResponse(
+      name='search', response={'result': 'sunny'}, id='call1'
+  )
+  tool_call2 = genai_types.FunctionCall(
+      name='lookup', args={'id': '123'}, id='call2'
+  )
+  intermediate_data = IntermediateData(
+      tool_uses=[tool_call1, tool_call2], tool_responses=[tool_response1]
+  )
+  assert get_all_tool_calls_with_responses(intermediate_data) == [
+      (tool_call1, tool_response1),
+      (tool_call2, None),
+  ]
+
+
+def test_get_all_tool_calls_with_responses_with_steps_no_tool_calls():
+  """Tests get_all_tool_calls_with_responses with steps that don't have tool calls."""
+  # No tool calls.
+  intermediate_data = InvocationEvents(invocation_events=[])
+  assert get_all_tool_calls_with_responses(intermediate_data) == []
+
+
+def test_get_all_tool_calls_with_responses_with_invocation_events():
+  """Tests get_all_tool_calls_with_responses with InvocationEvents."""
+  # No tools.
+  intermediate_data = InvocationEvents(invocation_events=[])
+  assert get_all_tool_calls_with_responses(intermediate_data) == []
+
+  # With matching and non-matching tool calls.
+  tool_call1 = genai_types.FunctionCall(
+      name='search', args={'query': 'weather'}, id='call1'
+  )
+  tool_response1 = genai_types.FunctionResponse(
+      name='search', response={'result': 'sunny'}, id='call1'
+  )
+  tool_call2 = genai_types.FunctionCall(
+      name='lookup', args={'id': '123'}, id='call2'
+  )
+  invocation_event1 = InvocationEvent(
+      author='agent',
+      content=genai_types.Content(
+          parts=[
+              genai_types.Part(function_call=tool_call1),
+              genai_types.Part(function_call=tool_call2),
+          ],
+          role='model',
+      ),
+  )
+  invocation_event2 = InvocationEvent(
+      author='tool',
+      content=genai_types.Content(
+          parts=[genai_types.Part(function_response=tool_response1)],
+          role='tool',
+      ),
+  )
+  intermediate_data = InvocationEvents(
+      invocation_events=[invocation_event1, invocation_event2]
+  )
+  assert get_all_tool_calls_with_responses(intermediate_data) == [
+      (tool_call1, tool_response1),
+      (tool_call2, None),
+  ]
@@ -17,12 +17,13 @@ from __future__ import annotations
 from google.adk.evaluation.eval_case import Invocation
 from google.adk.evaluation.eval_metrics import BaseCriterion
 from google.adk.evaluation.eval_metrics import EvalMetric
 from google.adk.evaluation.eval_metrics import EvalStatus
 from google.adk.evaluation.eval_metrics import JudgeModelOptions
 from google.adk.evaluation.eval_metrics import PrebuiltMetrics
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.evaluator import PerInvocationResult
 from google.adk.evaluation.final_response_match_v2 import _parse_critique
 from google.adk.evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
+from google.adk.evaluation.llm_as_judge import AutoRaterScore
 from google.adk.evaluation.llm_as_judge_utils import Label
 from google.adk.models.llm_response import LlmResponse
 from google.genai import types as genai_types
@@ -206,8 +207,10 @@ def test_convert_auto_rater_response_to_score_valid():
           role="model",
       )
   )
-  score = evaluator.convert_auto_rater_response_to_score(llm_response)
-  assert score == 1.0
+  auto_rater_score = evaluator.convert_auto_rater_response_to_score(
+      llm_response
+  )
+  assert auto_rater_score == AutoRaterScore(score=1.0)
 
 
 def test_convert_auto_rater_response_to_score_invalid():
@@ -224,8 +227,10 @@ def test_convert_auto_rater_response_to_score_invalid():
           role="model",
       )
   )
-  score = evaluator.convert_auto_rater_response_to_score(llm_response)
-  assert score == 0.0
+  auto_rater_score = evaluator.convert_auto_rater_response_to_score(
+      llm_response
+  )
+  assert auto_rater_score == AutoRaterScore(score=0.0)
 
 
 def test_convert_auto_rater_response_to_score_invalid_json():
@@ -236,8 +241,10 @@ def test_convert_auto_rater_response_to_score_invalid_json():
           role="model",
       )
   )
-  score = evaluator.convert_auto_rater_response_to_score(llm_response)
-  assert score is None
+  auto_rater_score = evaluator.convert_auto_rater_response_to_score(
+      llm_response
+  )
+  assert auto_rater_score == AutoRaterScore()
 
 
 def test_convert_auto_rater_response_to_score_missing_key():
@@ -248,8 +255,10 @@ def test_convert_auto_rater_response_to_score_missing_key():
           role="model",
       )
   )
-  score = evaluator.convert_auto_rater_response_to_score(llm_response)
-  assert score is None
+  auto_rater_score = evaluator.convert_auto_rater_response_to_score(
+      llm_response
+  )
+  assert auto_rater_score == AutoRaterScore()
 
 
 def test_aggregate_per_invocation_samples_none_evaluated():
@@ -24,6 +24,7 @@ from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.evaluator import EvaluationResult
 from google.adk.evaluation.evaluator import PerInvocationResult
+from google.adk.evaluation.llm_as_judge import AutoRaterScore
 from google.adk.evaluation.llm_as_judge import LlmAsJudge
 from google.adk.evaluation.llm_as_judge_utils import get_eval_status
 from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
@@ -41,8 +42,8 @@ class MockLlmAsJudge(LlmAsJudge):
 
   def convert_auto_rater_response_to_score(
       self, llm_response: LlmResponse
-  ) -> Optional[float]:
-    return 1.0
+  ) -> AutoRaterScore:
+    return AutoRaterScore(score=1.0)
 
   def aggregate_per_invocation_samples(
       self,
@@ -0,0 +1,290 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+
+from google.adk.evaluation.app_details import AgentDetails
+from google.adk.evaluation.app_details import AppDetails
+from google.adk.evaluation.eval_case import IntermediateData
+from google.adk.evaluation.eval_case import InvocationEvent
+from google.adk.evaluation.eval_case import InvocationEvents
+from google.adk.evaluation.eval_rubrics import RubricScore
+from google.adk.evaluation.evaluator import EvalStatus
+from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score
+from google.adk.evaluation.llm_as_judge_utils import get_eval_status
+from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
+from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
+from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str
+from google.genai import types as genai_types
+
+
+def test_get_text_from_content_with_none():
+  """Tests get_text_from_content with None as input."""
+  assert get_text_from_content(None) is None
+
+
+def test_get_text_from_content_with_content_and_none_parts():
+  """Tests get_text_from_content with Content that has None for parts."""
+  content = genai_types.Content(parts=None)
+  assert get_text_from_content(content) is None
+
+
+def test_get_text_from_content_with_empty_parts():
+  """Tests get_text_from_content with an empty parts list."""
+  content = genai_types.Content(parts=[])
+  assert get_text_from_content(content) is None
+
+
+def test_get_text_from_content_with_parts_but_no_text():
+  """Tests get_text_from_content with parts that do not contain text."""
+  content = genai_types.Content(
+      parts=[
+          genai_types.Part(
+              function_call=genai_types.FunctionCall(name="test_func")
+          )
+      ]
+  )
+  assert get_text_from_content(content) == ""
+
+
+def test_get_text_from_content_with_single_text_part():
+  """Tests get_text_from_content with a single text part."""
+  content = genai_types.Content(parts=[genai_types.Part(text="Hello")])
+  assert get_text_from_content(content) == "Hello"
+
+
+def test_get_text_from_content_with_multiple_text_parts():
+  """Tests get_text_from_content with multiple text parts."""
+  content = genai_types.Content(
+      parts=[genai_types.Part(text="Hello"), genai_types.Part(text="World")]
+  )
+  assert get_text_from_content(content) == "Hello\nWorld"
+
+
+def test_get_text_from_content_with_mixed_parts():
+  """Tests get_text_from_content with a mix of text and non-text parts."""
+  content = genai_types.Content(
+      parts=[
+          genai_types.Part(text="Hello"),
+          genai_types.Part(
+              function_call=genai_types.FunctionCall(name="test_func")
+          ),
+          genai_types.Part(text="World"),
+      ]
+  )
+  assert get_text_from_content(content) == "Hello\nWorld"
+
+
+def test_get_eval_status_with_none_score():
+  """Tests get_eval_status returns NOT_EVALUATED for a None score."""
+  assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED
+
+
+def test_get_eval_status_when_score_is_greater_than_threshold():
+  """Tests get_eval_status returns PASSED when score > threshold."""
+  assert get_eval_status(score=0.8, threshold=0.5) == EvalStatus.PASSED
+
+
+def test_get_eval_status_when_score_is_equal_to_threshold():
+  """Tests get_eval_status returns PASSED when score == threshold."""
+  assert get_eval_status(score=0.5, threshold=0.5) == EvalStatus.PASSED
+
+
+def test_get_eval_status_when_score_is_less_than_threshold():
+  """Tests get_eval_status returns FAILED when score < threshold."""
+  assert get_eval_status(score=0.4, threshold=0.5) == EvalStatus.FAILED
+
+
+def test_get_average_rubric_score_with_empty_list():
+  """Tests get_average_rubric_score returns None for an empty list."""
+  assert get_average_rubric_score([]) is None
+
+
+def test_get_average_rubric_score_with_all_none_scores():
+  """Tests get_average_rubric_score returns None when all scores are None."""
+  rubric_scores = [
+      RubricScore(rubric_id="1", score=None),
+      RubricScore(rubric_id="2", score=None),
+  ]
+  assert get_average_rubric_score(rubric_scores) is None
+
+
+def test_get_average_rubric_score_with_single_score():
+  """Tests get_average_rubric_score with a single valid score."""
+  rubric_scores = [RubricScore(rubric_id="1", score=0.8)]
+  assert get_average_rubric_score(rubric_scores) == 0.8
+
+
+def test_get_average_rubric_score_with_multiple_scores():
+  """Tests get_average_rubric_score with multiple valid scores."""
+  rubric_scores = [
+      RubricScore(rubric_id="1", score=0.8),
+      RubricScore(rubric_id="2", score=0.6),
+  ]
+  assert get_average_rubric_score(rubric_scores) == 0.7
+
+
+def test_get_average_rubric_score_with_mixed_scores():
+  """Tests get_average_rubric_score with a mix of valid and None scores."""
+  rubric_scores = [
+      RubricScore(rubric_id="1", score=0.8),
+      RubricScore(rubric_id="2", score=None),
+      RubricScore(rubric_id="3", score=0.6),
+  ]
+  assert get_average_rubric_score(rubric_scores) == 0.7
+
+
+def test_get_tool_declarations_as_json_str_with_no_agents():
+  """Tests get_tool_declarations_as_json_str with no agents."""
+  app_details = AppDetails(agent_details={})
+  expected_json = {"tool_declarations": {}}
+  actual_json_str = get_tool_declarations_as_json_str(app_details)
+  assert json.loads(actual_json_str) == expected_json
+
+
+def test_get_tool_declarations_as_json_str_with_agent_no_tools():
+  """Tests get_tool_declarations_as_json_str with an agent that has no tools."""
+  agent_details = {"agent1": AgentDetails(name="agent1", tool_declarations=[])}
+  app_details = AppDetails(agent_details=agent_details)
+  expected_json = {"tool_declarations": {"agent1": []}}
+  actual_json_str = get_tool_declarations_as_json_str(app_details)
+  assert json.loads(actual_json_str) == expected_json
+
+
+def test_get_tool_declarations_as_json_str_with_agent_with_tools():
+  """Tests get_tool_declarations_as_json_str with an agent that has tools."""
+  tool1 = genai_types.Tool(
+      function_declarations=[
+          genai_types.FunctionDeclaration(
+              name="test_func", description="A test function."
+          )
+      ]
+  )
+  agent_details = {
+      "agent1": AgentDetails(name="agent1", tool_declarations=[tool1])
+  }
+  app_details = AppDetails(agent_details=agent_details)
+  expected_json = {
+      "tool_declarations": {
+          "agent1": [{
+              "function_declarations": [{
+                  "name": "test_func",
+                  "description": "A test function.",
+              }]
+          }]
+      }
+  }
+  actual_json_str = get_tool_declarations_as_json_str(app_details)
+  assert json.loads(actual_json_str) == expected_json
+
+
+def test_get_tool_declarations_as_json_str_with_multiple_agents():
+  """Tests get_tool_declarations_as_json_str with multiple agents."""
+  tool1 = genai_types.Tool(
+      function_declarations=[
+          genai_types.FunctionDeclaration(
+              name="test_func1", description="A test function 1."
+          )
+      ]
+  )
+  agent_details = {
+      "agent1": AgentDetails(name="agent1", tool_declarations=[tool1]),
+      "agent2": AgentDetails(name="agent2", tool_declarations=[]),
+  }
+  app_details = AppDetails(agent_details=agent_details)
+  expected_json = {
+      "tool_declarations": {
+          "agent1": [{
+              "function_declarations": [{
+                  "name": "test_func1",
+                  "description": "A test function 1.",
+              }]
+          }],
+          "agent2": [],
+      }
+  }
+  actual_json_str = get_tool_declarations_as_json_str(app_details)
+  assert json.loads(actual_json_str) == expected_json
+
+
+def test_get_tool_calls_and_responses_as_json_str_with_none():
+  """Tests get_tool_calls_and_responses_as_json_str with None."""
+  assert (
+      get_tool_calls_and_responses_as_json_str(None)
+      == "No intermediate steps were taken."
+  )
+
+
+def test_get_tool_calls_and_responses_as_json_str_with_intermediate_data_no_tools():
+  """Tests get_tool_calls_and_responses_as_json_str with IntermediateData and no tools."""
+  intermediate_data = IntermediateData(tool_uses=[], tool_responses=[])
+  assert (
+      get_tool_calls_and_responses_as_json_str(intermediate_data)
+      == "No intermediate steps were taken."
+  )
+
+  intermediate_data = InvocationEvents(invocation_events=[])
+  assert (
+      get_tool_calls_and_responses_as_json_str(intermediate_data)
+      == "No intermediate steps were taken."
+  )
+
+
+def test_get_tool_calls_and_responses_as_json_str_with_invocation_events_multiple_calls():
+  """Tests get_tool_calls_and_responses_as_json_str with multiple calls in InvocationEvents."""
+  tool_call1 = genai_types.FunctionCall(name="func1", args={}, id="call1")
+  tool_call2 = genai_types.FunctionCall(name="func2", args={}, id="call2")
+  tool_response1 = genai_types.FunctionResponse(
+      name="func1", response={"status": "ok"}, id="call1"
+  )
+  invocation_event1 = InvocationEvent(
+      author="agent",
+      content=genai_types.Content(
+          parts=[
+              genai_types.Part(function_call=tool_call1),
+              genai_types.Part(function_call=tool_call2),
+          ]
+      ),
+  )
+  invocation_event2 = InvocationEvent(
+      author="tool",
+      content=genai_types.Content(
+          parts=[genai_types.Part(function_response=tool_response1)]
+      ),
+  )
+  intermediate_data = InvocationEvents(
+      invocation_events=[invocation_event1, invocation_event2]
+  )
+  json_str = get_tool_calls_and_responses_as_json_str(intermediate_data)
+  expected_json = {
+      "tool_calls_and_response": [
+          {
+              "step": 0,
+              "tool_call": {"name": "func1", "args": {}, "id": "call1"},
+              "tool_response": {
+                  "name": "func1",
+                  "response": {"status": "ok"},
+                  "id": "call1",
+              },
+          },
+          {
+              "step": 1,
+              "tool_call": {"name": "func2", "args": {}, "id": "call2"},
+              "tool_response": "None",
+          },
+      ]
+  }
+  assert json.loads(json_str) == expected_json
File diff suppressed because it is too large.