From 9fbed0b15afb94ec8c0c7ab60221bbc97e481b06 Mon Sep 17 00:00:00 2001
From: Joseph Pagadora
Date: Tue, 14 Oct 2025 11:35:53 -0700
Subject: [PATCH] fix: Overall eval status should be NOT_EVALUATED if no invocations were evaluated

PiperOrigin-RevId: 819322513
---
 .../adk/evaluation/hallucinations_v1.py       |  2 +-
 .../evaluation/test_hallucinations_v1.py      | 67 +++++++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/hallucinations_v1.py b/src/google/adk/evaluation/hallucinations_v1.py
index ad9162f8..15cda35d 100644
--- a/src/google/adk/evaluation/hallucinations_v1.py
+++ b/src/google/adk/evaluation/hallucinations_v1.py
@@ -694,7 +694,7 @@ class HallucinationsV1Evaluator(Evaluator):
     if not valid_results:
       return EvaluationResult(
           overall_score=None,
-          overall_eval_status=EvalStatus.FAILED,
+          overall_eval_status=EvalStatus.NOT_EVALUATED,
           per_invocation_results=per_invocation_results,
       )
 
diff --git a/tests/unittests/evaluation/test_hallucinations_v1.py b/tests/unittests/evaluation/test_hallucinations_v1.py
index 04742441..d74cb24a 100644
--- a/tests/unittests/evaluation/test_hallucinations_v1.py
+++ b/tests/unittests/evaluation/test_hallucinations_v1.py
@@ -1445,6 +1445,73 @@ async def test_evaluate_invocations_no_nl_response(hallucinations_metric):
   assert per_invocation_result.eval_status == EvalStatus.NOT_EVALUATED
 
 
+@pytest.mark.asyncio
+async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric):
+  metric = hallucinations_metric
+  app_details = AppDetails(
+      agent_details={
+          "root": AgentDetails(
+              name="root",
+              instructions="Root agent instructions.",
+              tool_declarations=[],
+          ),
+      },
+  )
+  user_content = genai_types.Content(
+      parts=[genai_types.Part(text="User query.")]
+  )
+  actual_invocation = Invocation(
+      app_details=app_details,
+      user_content=user_content,
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="root",
+                  content=genai_types.Content(
+                      parts=[
+                          genai_types.Part(text="Intermediate NL response."),
+                      ]
+                  ),
+              ),
+          ]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="Final response.")]
+      ),
+  )
+  expected_invocation = Invocation(
+      app_details=app_details,
+      user_content=user_content,
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="Final response.")]
+      ),
+  )
+
+  async def mock_evaluate_nl_response(nl_response, context):
+    return None, "Judge model error."
+
+  with patch(
+      "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
+      side_effect=mock_evaluate_nl_response,
+  ):
+    result = await metric.evaluate_invocations(
+        [actual_invocation, actual_invocation],
+        [expected_invocation, expected_invocation],
+    )
+
+  assert len(result.per_invocation_results) == 2
+  assert result.per_invocation_results[0].score is None
+  assert (
+      result.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED
+  )
+  assert result.per_invocation_results[1].score is None
+  assert (
+      result.per_invocation_results[1].eval_status == EvalStatus.NOT_EVALUATED
+  )
+  assert result.overall_score is None
+  assert result.overall_eval_status == EvalStatus.NOT_EVALUATED
+
+
 @pytest.mark.asyncio
 async def test_evaluate_invocations_partial_failure(hallucinations_metric):
   metric = hallucinations_metric