fix: Overall eval status should be NOT_EVALUATED if no invocations were evaluated

PiperOrigin-RevId: 819322513
2026-03-30 10:57:20 -07:00 · 2025-10-14 11:35:53 -07:00
parent bae21027d9
commit 9fbed0b15a
2 changed files with 68 additions and 1 deletion
@@ -694,7 +694,7 @@ class HallucinationsV1Evaluator(Evaluator):
    if not valid_results:
      return EvaluationResult(
          overall_score=None,
-          overall_eval_status=EvalStatus.FAILED,
+          overall_eval_status=EvalStatus.NOT_EVALUATED,
          per_invocation_results=per_invocation_results,
      )

@@ -1445,6 +1445,73 @@ async def test_evaluate_invocations_no_nl_response(hallucinations_metric):
  assert per_invocation_result.eval_status == EvalStatus.NOT_EVALUATED


+@pytest.mark.asyncio
+async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric):
+  metric = hallucinations_metric
+  app_details = AppDetails(
+      agent_details={
+          "root": AgentDetails(
+              name="root",
+              instructions="Root agent instructions.",
+              tool_declarations=[],
+          ),
+      },
+  )
+  user_content = genai_types.Content(
+      parts=[genai_types.Part(text="User query.")]
+  )
+  actual_invocation = Invocation(
+      app_details=app_details,
+      user_content=user_content,
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="root",
+                  content=genai_types.Content(
+                      parts=[
+                          genai_types.Part(text="Intermediate NL response."),
+                      ]
+                  ),
+              ),
+          ]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="Final response.")]
+      ),
+  )
+  expected_invocation = Invocation(
+      app_details=app_details,
+      user_content=user_content,
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="Final response.")]
+      ),
+  )
+
+  async def mock_evaluate_nl_response(nl_response, context):
+    return None, "Judge model error."
+
+  with patch(
+      "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
+      side_effect=mock_evaluate_nl_response,
+  ):
+    result = await metric.evaluate_invocations(
+        [actual_invocation, actual_invocation],
+        [expected_invocation, expected_invocation],
+    )
+
+    assert len(result.per_invocation_results) == 2
+    assert result.per_invocation_results[0].score is None
+    assert (
+        result.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED
+    )
+    assert result.per_invocation_results[1].score is None
+    assert (
+        result.per_invocation_results[1].eval_status == EvalStatus.NOT_EVALUATED
+    )
+    assert result.overall_score is None
+    assert result.overall_eval_status == EvalStatus.NOT_EVALUATED
+
+
@pytest.mark.asyncio
 async def test_evaluate_invocations_partial_failure(hallucinations_metric):
  metric = hallucinations_metric