chore: Marked expected_invocation as optional field on evaluator interface

ADK already has a set of metrics that don't rely on expected_invocations. Also, for eval cases with a conversation scenario, this would be the mainline case. PiperOrigin-RevId: 825101481
2026-03-30 10:57:20 -07:00 · 2025-10-28 10:27:06 -07:00
parent 9ab17f2afd
commit b17c8f19e5
15 changed files with 282 additions and 102 deletions
@@ -210,21 +210,23 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):

  data = []
  for per_invocation_result in eval_result.eval_metric_result_per_invocation:
+    actual_invocation = per_invocation_result.actual_invocation
+    expected_invocation = per_invocation_result.expected_invocation
    row_data = {
-        "prompt": _convert_content_to_text(
-            per_invocation_result.expected_invocation.user_content
-        ),
+        "prompt": _convert_content_to_text(actual_invocation.user_content),
        "expected_response": _convert_content_to_text(
-            per_invocation_result.expected_invocation.final_response
+            expected_invocation.final_response if expected_invocation else None
        ),
        "actual_response": _convert_content_to_text(
-            per_invocation_result.actual_invocation.final_response
+            actual_invocation.final_response
        ),
        "expected_tool_calls": _convert_tool_calls_to_text(
-            per_invocation_result.expected_invocation.intermediate_data
+            expected_invocation.intermediate_data
+            if expected_invocation
+            else None
        ),
        "actual_tool_calls": _convert_tool_calls_to_text(
-            per_invocation_result.actual_invocation.intermediate_data
+            actual_invocation.intermediate_data
        ),
    }
    for metric_result in per_invocation_result.eval_metric_results:
@@ -216,10 +216,11 @@ class EvalMetricResultPerInvocation(EvalBaseModel):
      )
  )

-  expected_invocation: Invocation = Field(
+  expected_invocation: Optional[Invocation] = Field(
+      default=None,
      description=(
          "The expected invocation, usually the reference or golden invocation."
-      )
+      ),
  )

  eval_metric_results: list[EvalMetricResult] = Field(
@@ -33,7 +33,7 @@ class PerInvocationResult(BaseModel):
  """Metric evaluation score per invocation."""

  actual_invocation: Invocation
-  expected_invocation: Invocation
+  expected_invocation: Optional[Invocation] = None
  score: Optional[float] = None
  eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
  rubric_scores: Optional[list[RubricScore]] = None
@@ -61,7 +61,16 @@ class Evaluator(ABC):
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ) -> EvaluationResult:
-    """Returns EvaluationResult after performing evaluations using actual and expected invocations."""
+    """Returns EvaluationResult after performing evaluations using actual and expected invocations.
+
+    Args:
+      actual_invocations: These are the invocations that are obtained from the
+        agent under test.
+      expected_invocations: An optional list of invocations that, if
+        specified, usually act as a benchmark/golden response. If these are
+        specified, the expectation is usually that the length of this list and
+        the length of actual_invocations are the same.
+    """
    raise NotImplementedError()
@@ -59,8 +59,11 @@ class RougeEvaluator(Evaluator):
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ) -> EvaluationResult:
+    if expected_invocations is None:
+      raise ValueError("expected_invocations is required for this metric.")
+
    total_score = 0.0
    num_invocations = 0
    per_invocation_results = []
@@ -147,7 +147,11 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
      self,
      eval_metric: EvalMetric,
  ):
-    super().__init__(eval_metric, FinalResponseMatchV2Evaluator.criterion_type)
+    super().__init__(
+        eval_metric,
+        FinalResponseMatchV2Evaluator.criterion_type,
+        expected_invocations_required=True,
+    )
    self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT

  @staticmethod
@@ -166,8 +170,13 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):

  @override
  def format_auto_rater_prompt(
-      self, actual_invocation: Invocation, expected_invocation: Invocation
+      self,
+      actual_invocation: Invocation,
+      expected_invocation: Optional[Invocation],
  ) -> str:
+    if expected_invocation is None:
+      raise ValueError("expected_invocation is required for this metric.")
+
    reference = get_text_from_content(expected_invocation.final_response)
    response = get_text_from_content(actual_invocation.final_response)
    user_prompt = get_text_from_content(expected_invocation.user_content)
@@ -395,7 +395,8 @@ class HallucinationsV1Evaluator(Evaluator):
        },
        {
          "name": "get_weather",
-          "description": '''Gets the weather of the given place at the given time.
+          "description": '''Gets the weather of the given place at the given
+          time.

    Args:
      location: The location for which to retrieve weather information.
@@ -408,7 +409,8 @@ class HallucinationsV1Evaluator(Evaluator):
            "type": "object",
            "properties": {
              "location": {
-                "description": "The location for which to retrieve weather information.",
+                "description": "The location for which to retrieve weather
+                information.",
                "type": "string"
              },
              "time": {
@@ -711,8 +713,15 @@ class HallucinationsV1Evaluator(Evaluator):
  async def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ) -> EvaluationResult:
+    # expected_invocations are not required by this metric; if they are not
+    # supplied, we provide a list of None to the rest of the code.
+    expected_invocations = (
+        [None] * len(actual_invocations)
+        if expected_invocations is None
+        else expected_invocations
+    )
    per_invocation_results = []
    for actual, expected in zip(actual_invocations, expected_invocations):
      step_evaluations = self._get_steps_to_evaluate(actual)
@@ -60,9 +60,13 @@ class LlmAsJudge(Evaluator):
  """

  def __init__(
-      self, eval_metric: EvalMetric, criterion_type: type[BaseCriterion]
+      self,
+      eval_metric: EvalMetric,
+      criterion_type: type[BaseCriterion],
+      expected_invocations_required=False,
  ):
    self._eval_metric = eval_metric
+    self._expected_invocations_required = expected_invocations_required

    expected_criterion_type_error = ValueError(
        f"`{eval_metric.metric_name}` metric expects a criterion of type"
@@ -84,7 +88,7 @@ class LlmAsJudge(Evaluator):

  @abstractmethod
  def format_auto_rater_prompt(
-      self, actual: Invocation, expected: Invocation
+      self, actual: Invocation, expected: Optional[Invocation]
  ) -> str:
    """Formats the auto-rater prompt to evaluate the given invocation."""

@@ -112,8 +116,19 @@ class LlmAsJudge(Evaluator):
  async def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ) -> EvaluationResult:
+    if self._expected_invocations_required and expected_invocations is None:
+      raise ValueError("expected_invocations is needed by this metric.")
+
+    # If expected_invocations are not required by the metric and they are not
+    # supplied, we substitute a list of None, one per actual invocation.
+    expected_invocations = (
+        [None] * len(actual_invocations)
+        if expected_invocations is None
+        else expected_invocations
+    )
+
    per_invocation_results = []
    for actual, expected in zip(actual_invocations, expected_invocations):
      auto_rater_prompt = self.format_auto_rater_prompt(actual, expected)
@@ -22,8 +22,6 @@ from typing import Callable
 from typing import Optional
 import uuid

-from google.genai.types import Content
-from google.genai.types import Part
 from typing_extensions import override

 from ..agents.base_agent import BaseAgent
@@ -51,6 +49,7 @@ from .eval_sets_manager import EvalSetsManager
 from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
+from .evaluator import PerInvocationResult
 from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
 from .metric_evaluator_registry import MetricEvaluatorRegistry
 from .user_simulator_provider import UserSimulatorProvider
@@ -222,43 +221,9 @@ class LocalEvalService(BaseEvalService):
        else 'test_user_id'
    )

-    if eval_case.conversation_scenario:
-      logger.warning(
-          'Skipping evaluation of variable-length conversation scenario in eval'
-          ' set/case %s/%s.',
-          inference_result.eval_set_id,
-          inference_result.eval_case_id,
-      )
-      for actual_invocation in inference_result.inferences:
-        eval_metric_result_per_invocation.append(
-            EvalMetricResultPerInvocation(
-                actual_invocation=actual_invocation,
-                expected_invocation=Invocation(
-                    user_content=actual_invocation.user_content,
-                    final_response=Content(
-                        parts=[Part(text='N/A')], role='model'
-                    ),
-                ),
-            )
-        )
-      eval_case_result = EvalCaseResult(
-          eval_set_file=inference_result.eval_set_id,
-          eval_set_id=inference_result.eval_set_id,
-          eval_id=inference_result.eval_case_id,
-          final_eval_status=EvalStatus.NOT_EVALUATED,
-          overall_eval_metric_results=overall_eval_metric_results,
-          eval_metric_result_per_invocation=eval_metric_result_per_invocation,
-          session_id=inference_result.session_id,
-          session_details=await self._session_service.get_session(
-              app_name=inference_result.app_name,
-              user_id=user_id,
-              session_id=inference_result.session_id,
-          ),
-          user_id=user_id,
-      )
-      return (inference_result, eval_case_result)
-
-    if len(inference_result.inferences) != len(eval_case.conversation):
+    if eval_case.conversation_scenario is None and len(
+        inference_result.inferences
+    ) != len(eval_case.conversation):
      raise ValueError(
          'Inferences should match conversations in eval case. Found'
          f'{len(inference_result.inferences)} inferences '
@@ -266,13 +231,13 @@ class LocalEvalService(BaseEvalService):
      )

    # Pre-creating the EvalMetricResults entries for each invocation.
-    for actual, expected in zip(
-        inference_result.inferences, eval_case.conversation
-    ):
+    for idx, actual in enumerate(inference_result.inferences):
      eval_metric_result_per_invocation.append(
          EvalMetricResultPerInvocation(
              actual_invocation=actual,
-              expected_invocation=expected,
+              expected_invocation=eval_case.conversation[idx]
+              if eval_case.conversation
+              else None,
              # We will fill this as we evaluate each metric per invocation.
              eval_metric_results=[],
          )
@@ -280,11 +245,27 @@ class LocalEvalService(BaseEvalService):

    for eval_metric in evaluate_config.eval_metrics:
      # Perform evaluation of the metric.
-      evaluation_result = await self._evaluate_metric(
-          eval_metric=eval_metric,
-          actual_invocations=inference_result.inferences,
-          expected_invocations=eval_case.conversation,
-      )
+      try:
+        evaluation_result = await self._evaluate_metric(
+            eval_metric=eval_metric,
+            actual_invocations=inference_result.inferences,
+            expected_invocations=eval_case.conversation,
+        )
+      except Exception as e:
+        # We intentionally catch the Exception as we don't want failures to
+        # affect other metric evaluation.
+        logger.error(
+            "Metric evaluation failed for metric `%s` for eval case id '%s'"
+            ' with following error `%s`',
+            eval_metric.metric_name,
+            eval_case.eval_id,
+            e,
+            exc_info=True,
+        )
+        # We use an empty result.
+        evaluation_result = EvaluationResult(
+            overall_eval_status=EvalStatus.NOT_EVALUATED
+        )

      # Track overall scrore across all invocations.
      eval_metric_result_details = EvalMetricResultDetails(
@@ -299,8 +280,10 @@ class LocalEvalService(BaseEvalService):
          )
      )

-      if len(evaluation_result.per_invocation_results) != len(
-          eval_metric_result_per_invocation
+      if (
+          evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
+          and len(evaluation_result.per_invocation_results)
+          != len(eval_metric_result_per_invocation)
      ):
        raise ValueError(
            'Eval metric should return results for each invocation. Found '
@@ -309,10 +292,14 @@ class LocalEvalService(BaseEvalService):
        )

      # Track score across individual invocations.
-      for invocation_result, invocation in zip(
-          evaluation_result.per_invocation_results,
-          eval_metric_result_per_invocation,
-      ):
+      for idx, invocation in enumerate(eval_metric_result_per_invocation):
+        invocation_result = (
+            evaluation_result.per_invocation_results[idx]
+            if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
+            else PerInvocationResult(
+                actual_invocation=invocation.actual_invocation
+            )
+        )
        eval_metric_result_details = EvalMetricResultDetails(
            rubric_scores=invocation_result.rubric_scores
        )
@@ -351,7 +338,7 @@ class LocalEvalService(BaseEvalService):
      self,
      eval_metric: EvalMetric,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ) -> EvaluationResult:
    """Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""

@@ -100,7 +100,7 @@ class ResponseEvaluator(Evaluator):
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ) -> EvaluationResult:
    # If the metric is response_match_score, just use the RougeEvaluator.
    if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value:
@@ -112,5 +112,7 @@ class ResponseEvaluator(Evaluator):
      )

    return _VertexAiEvalFacade(
-        threshold=self._threshold, metric_name=self._metric_name
+        threshold=self._threshold,
+        metric_name=self._metric_name,
+        expected_invocations_required=True,
    ).evaluate_invocations(actual_invocations, expected_invocations)
@@ -16,6 +16,7 @@ from __future__ import annotations

 import logging
 from typing import ClassVar
+from typing import Optional

 from typing_extensions import override

@@ -281,7 +282,7 @@ class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator):

  @override
  def format_auto_rater_prompt(
-      self, actual_invocation: Invocation, _: Invocation
+      self, actual_invocation: Invocation, _: Optional[Invocation]
  ) -> str:
    """Returns the autorater prompt."""

@@ -16,6 +16,7 @@ from __future__ import annotations

 import logging
 from typing import ClassVar
+from typing import Optional

 from typing_extensions import override

@@ -181,7 +182,7 @@ class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator):

  @override
  def format_auto_rater_prompt(
-      self, actual_invocation: Invocation, _: Invocation
+      self, actual_invocation: Invocation, _: Optional[Invocation]
  ) -> str:
    """Returns the autorater prompt."""

@@ -14,6 +14,8 @@

 from __future__ import annotations

+from typing import Optional
+
 from typing_extensions import override

 from ..dependencies.vertexai import vertexai
@@ -66,7 +68,7 @@ class SafetyEvaluatorV1(Evaluator):
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ) -> EvaluationResult:
    return _VertexAiEvalFacade(
        threshold=self._eval_metric.threshold,
@@ -71,9 +71,12 @@ class TrajectoryEvaluator(Evaluator):
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ) -> EvaluationResult:
    """Returns EvaluationResult after performing evaluations using actual and expected invocations."""
+    if expected_invocations is None:
+      raise ValueError("expected_invocations is needed by this metric.")
+
    total_tool_use_accuracy = 0.0
    num_invocations = 0
    per_invocation_results = []
@@ -55,23 +55,38 @@ class _VertexAiEvalFacade(Evaluator):
  """

  def __init__(
-      self, threshold: float, metric_name: vertexai_types.PrebuiltMetric
+      self,
+      threshold: float,
+      metric_name: vertexai_types.PrebuiltMetric,
+      expected_invocations_required=False,
  ):
    self._threshold = threshold
    self._metric_name = metric_name
+    self._expected_invocations_required = expected_invocations_required

  @override
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ) -> EvaluationResult:
+    if self._expected_invocations_required and expected_invocations is None:
+      raise ValueError("expected_invocations is needed by this metric.")
+
+    # If expected_invocations are not required by the metric and they are not
+    # supplied, we substitute a list of None, one per actual invocation.
+    expected_invocations = (
+        [None] * len(actual_invocations)
+        if expected_invocations is None
+        else expected_invocations
+    )
+
    total_score = 0.0
    num_invocations = 0
    per_invocation_results = []
    for actual, expected in zip(actual_invocations, expected_invocations):
-      prompt = self._get_text(expected.user_content)
-      reference = self._get_text(expected.final_response)
+      prompt = self._get_text(actual.user_content)
+      reference = self._get_text(expected.final_response) if expected else None
      response = self._get_text(actual.final_response)
      eval_case = {
          "prompt": prompt,
@@ -16,6 +16,7 @@ from __future__ import annotations

 import asyncio
 import sys
+from typing import Optional

 from google.adk.agents.llm_agent import LlmAgent
 from google.adk.errors.not_found_error import NotFoundError
@@ -70,6 +71,10 @@ def eval_service(
  DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
      metric_info=FakeEvaluator.get_metric_info(), evaluator=FakeEvaluator
  )
+  DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
+      metric_info=FakeSingleSidedEvaluator.get_metric_info(),
+      evaluator=FakeSingleSidedEvaluator,
+  )
  return LocalEvalService(
      root_agent=dummy_agent,
      eval_sets_manager=mock_eval_sets_manager,
@@ -95,8 +100,10 @@ class FakeEvaluator(Evaluator):
  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
  ):
+    if expected_invocations is None:
+      raise ValueError("expected_invocations is required for this metric.")
    per_invocation_results = []
    for actual, expected in zip(actual_invocations, expected_invocations):
      per_invocation_results.append(
@@ -114,6 +121,42 @@ class FakeEvaluator(Evaluator):
    )


+class FakeSingleSidedEvaluator(Evaluator):
+
+  def __init__(self, eval_metric: EvalMetric):
+    self._eval_metric = eval_metric
+
+  @staticmethod
+  def get_metric_info() -> MetricInfo:
+    return MetricInfo(
+        metric_name="fake_single_sided_metric",
+        description="Fake single sided metric description",
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
+  def evaluate_invocations(
+      self,
+      actual_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
+  ):
+    per_invocation_results = []
+    for actual in actual_invocations:
+      per_invocation_results.append(
+          PerInvocationResult(
+              actual_invocation=actual,
+              score=0.995,
+              eval_status=EvalStatus.PASSED,
+          )
+      )
+    return EvaluationResult(
+        overall_score=0.95,
+        overall_eval_status=EvalStatus.PASSED,
+        per_invocation_results=per_invocation_results,
+    )
+
+
@pytest.mark.asyncio
 async def test_perform_inference_success(
    eval_service,
@@ -224,19 +267,27 @@ async def test_perform_inference_eval_set_not_found(
 async def test_evaluate_success(
    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
 ):
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
  inference_results = [
      InferenceResult(
          app_name="test_app",
          eval_set_id="test_eval_set",
          eval_case_id="case1",
-          inferences=[],
+          inferences=[invocation.model_copy(deep=True)],
          session_id="session1",
      ),
      InferenceResult(
          app_name="test_app",
          eval_set_id="test_eval_set",
          eval_case_id="case2",
-          inferences=[],
+          inferences=[invocation.model_copy(deep=True)],
          session_id="session2",
      ),
  ]
@@ -247,7 +298,7 @@ async def test_evaluate_success(
  )

  mock_eval_case = mocker.MagicMock(spec=EvalCase)
-  mock_eval_case.conversation = []
+  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
  mock_eval_case.conversation_scenario = None
  mock_eval_case.session_input = None
  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
@@ -357,7 +408,7 @@ async def test_evaluate_single_inference_result(


@pytest.mark.asyncio
-async def test_evaluate_single_inference_result_skipped_for_conversation_scenario(
+async def test_evaluate_single_inference_result_for_conversation_scenario(
    eval_service, mock_eval_sets_manager, mocker
 ):
  """To be removed once evaluation is implemented for conversation scenarios."""
@@ -373,10 +424,16 @@ async def test_evaluate_single_inference_result_skipped_for_conversation_scenari
      app_name="test_app",
      eval_set_id="test_eval_set",
      eval_case_id="case1",
-      inferences=[invocation.model_copy(deep=True)],
+      inferences=[
+          invocation.model_copy(deep=True),
+          invocation.model_copy(deep=True),
+          invocation.model_copy(deep=True),
+      ],
      session_id="session1",
  )
-  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  eval_metric = EvalMetric(
+      metric_name="fake_single_sided_metric", threshold=0.5
+  )
  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)

  mock_eval_case = mocker.MagicMock(spec=EvalCase)
@@ -390,15 +447,77 @@ async def test_evaluate_single_inference_result_skipped_for_conversation_scenari
  )
  assert isinstance(result, EvalCaseResult)
  assert result.eval_id == "case1"
-  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
-  assert not result.overall_eval_metric_results
-  assert len(result.eval_metric_result_per_invocation) == 1
-  invocation_result = result.eval_metric_result_per_invocation[0]
-  assert not invocation_result.eval_metric_results
+  assert result.final_eval_status == EvalStatus.PASSED
+  assert len(result.overall_eval_metric_results) == 1
  assert (
-      invocation_result.expected_invocation.final_response.parts[0].text
-      == "N/A"
+      result.overall_eval_metric_results[0].metric_name
+      == "fake_single_sided_metric"
  )
+  assert result.overall_eval_metric_results[0].score == 0.95
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+  assert len(result.eval_metric_result_per_invocation) == 3
+  for i in range(3):
+    invocation_result = result.eval_metric_result_per_invocation[i]
+    assert invocation_result.actual_invocation == inference_result.inferences[i]
+    assert invocation_result.expected_invocation == None
+    assert len(invocation_result.eval_metric_results) == 1
+    metric_result = invocation_result.eval_metric_results[0]
+    assert metric_result.metric_name == "fake_single_sided_metric"
+    assert metric_result.score == 0.995
+    assert metric_result.eval_status == EvalStatus.PASSED
+
+
+@pytest.mark.asyncio
+async def test_evaluate_single_inference_result_for_conversation_scenario_with_unsupported_metric(
+    eval_service, mock_eval_sets_manager, mocker
+):
+  """To be removed once evaluation is implemented for conversation scenarios."""
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="test user content.")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="test final response.")]
+      ),
+  )
+  inference_result = InferenceResult(
+      app_name="test_app",
+      eval_set_id="test_eval_set",
+      eval_case_id="case1",
+      inferences=[
+          invocation.model_copy(deep=True),
+          invocation.model_copy(deep=True),
+          invocation.model_copy(deep=True),
+      ],
+      session_id="session1",
+  )
+  eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
+  evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
+
+  mock_eval_case = mocker.MagicMock(spec=EvalCase)
+  mock_eval_case.eval_id = "case1"
+  mock_eval_case.conversation = None
+  mock_eval_case.conversation_scenario = mocker.MagicMock()
+  mock_eval_case.session_input = None
+  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
+
+  _, result = await eval_service._evaluate_single_inference_result(
+      inference_result=inference_result, evaluate_config=evaluate_config
+  )
+  assert isinstance(result, EvalCaseResult)
+  assert result.eval_id == "case1"
+  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
+  assert len(result.overall_eval_metric_results) == 1
+  assert result.overall_eval_metric_results[0].metric_name == "fake_metric"
+  assert result.overall_eval_metric_results[0].score is None
+  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
+      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
+  )
+
+  assert len(result.eval_metric_result_per_invocation) == 3


 def test_generate_final_eval_status_doesn_t_throw_on(eval_service):
@@ -424,9 +543,11 @@ async def test_mcp_stdio_agent_no_runtime_error(mocker):
  """Test that LocalEvalService can handle MCP stdio agents without RuntimeError.

  This is a regression test for GitHub issue #2196:
-  "RuntimeError: Attempted to exit cancel scope in a different task than it was entered in"
+  "RuntimeError: Attempted to exit cancel scope in a different task than it was
+  entered in"

-  The fix ensures that Runner.close() is called to properly cleanup MCP connections.
+  The fix ensures that Runner.close() is called to properly cleanup MCP
+  connections.
  """
  import tempfile