From 9fbed0b15afb94ec8c0c7ab60221bbc97e481b06 Mon Sep 17 00:00:00 2001
From: Joseph Pagadora <jcpagadora@google.com>
Date: Tue, 14 Oct 2025 11:35:53 -0700
Subject: [PATCH] fix: Overall eval status should be NOT_EVALUATED if no
 invocations were evaluated

PiperOrigin-RevId: 819322513
---
 .../adk/evaluation/hallucinations_v1.py       |  2 +-
 .../evaluation/test_hallucinations_v1.py      | 67 +++++++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/hallucinations_v1.py b/src/google/adk/evaluation/hallucinations_v1.py
index ad9162f8..15cda35d 100644
--- a/src/google/adk/evaluation/hallucinations_v1.py
+++ b/src/google/adk/evaluation/hallucinations_v1.py
@@ -694,7 +694,7 @@ class HallucinationsV1Evaluator(Evaluator):
     if not valid_results:
       return EvaluationResult(
           overall_score=None,
-          overall_eval_status=EvalStatus.FAILED,
+          overall_eval_status=EvalStatus.NOT_EVALUATED,
           per_invocation_results=per_invocation_results,
       )
 
diff --git a/tests/unittests/evaluation/test_hallucinations_v1.py b/tests/unittests/evaluation/test_hallucinations_v1.py
index 04742441..d74cb24a 100644
--- a/tests/unittests/evaluation/test_hallucinations_v1.py
+++ b/tests/unittests/evaluation/test_hallucinations_v1.py
@@ -1445,6 +1445,73 @@ async def test_evaluate_invocations_no_nl_response(hallucinations_metric):
   assert per_invocation_result.eval_status == EvalStatus.NOT_EVALUATED
 
 
+@pytest.mark.asyncio
+async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric):
+  metric = hallucinations_metric  # Object under test (see call below).
+  app_details = AppDetails(
+      agent_details={
+          "root": AgentDetails(
+              name="root",
+              instructions="Root agent instructions.",
+              tool_declarations=[],
+          ),
+      },
+  )
+  user_content = genai_types.Content(
+      parts=[genai_types.Part(text="User query.")]
+  )
+  actual_invocation = Invocation(
+      app_details=app_details,
+      user_content=user_content,
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="root",
+                  content=genai_types.Content(
+                      parts=[
+                          genai_types.Part(text="Intermediate NL response."),
+                      ]
+                  ),
+              ),
+          ]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="Final response.")]
+      ),
+  )
+  expected_invocation = Invocation(
+      app_details=app_details,
+      user_content=user_content,
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="Final response.")]
+      ),
+  )
+
+  async def mock_evaluate_nl_response(nl_response, context):
+    return None, "Judge model error."  # Presumably (score, error).
+
+  with patch(
+      "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
+      side_effect=mock_evaluate_nl_response,
+  ):
+    result = await metric.evaluate_invocations(
+        [actual_invocation, actual_invocation],  # Same input, twice.
+        [expected_invocation, expected_invocation],
+    )
+
+    assert len(result.per_invocation_results) == 2  # One per invocation.
+    assert result.per_invocation_results[0].score is None
+    assert (
+        result.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED
+    )
+    assert result.per_invocation_results[1].score is None
+    assert (
+        result.per_invocation_results[1].eval_status == EvalStatus.NOT_EVALUATED
+    )
+    assert result.overall_score is None  # No invocation produced a score.
+    assert result.overall_eval_status == EvalStatus.NOT_EVALUATED
+
+
 @pytest.mark.asyncio
 async def test_evaluate_invocations_partial_failure(hallucinations_metric):
   metric = hallucinations_metric