fix: Overall eval status should be NOT_EVALUATED if no invocations were evaluated

PiperOrigin-RevId: 819322513
This commit is contained in:
Joseph Pagadora
2025-10-14 11:35:53 -07:00
committed by Copybara-Service
parent bae21027d9
commit 9fbed0b15a
2 changed files with 68 additions and 1 deletion
@@ -694,7 +694,7 @@ class HallucinationsV1Evaluator(Evaluator):
if not valid_results:
return EvaluationResult(
overall_score=None,
overall_eval_status=EvalStatus.FAILED,
overall_eval_status=EvalStatus.NOT_EVALUATED,
per_invocation_results=per_invocation_results,
)
@@ -1445,6 +1445,73 @@ async def test_evaluate_invocations_no_nl_response(hallucinations_metric):
assert per_invocation_result.eval_status == EvalStatus.NOT_EVALUATED
@pytest.mark.asyncio
async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric):
  """Checks the overall result when no invocation produces a score.

  `_evaluate_nl_response` is patched so that every call returns
  `(None, "Judge model error.")`. The same actual/expected invocation pair is
  evaluated twice, and the test expects each per-invocation result, as well as
  the overall result, to have no score and a NOT_EVALUATED status.
  """

  async def failing_nl_response_eval(nl_response, context):
    # Stand-in for the patched method: no score, only an error message.
    return None, "Judge model error."

  root_agent = AgentDetails(
      name="root",
      instructions="Root agent instructions.",
      tool_declarations=[],
  )
  details = AppDetails(agent_details={"root": root_agent})
  query = genai_types.Content(parts=[genai_types.Part(text="User query.")])

  # The actual invocation carries one intermediate natural-language event in
  # addition to its final response; the expected one has no intermediate data.
  intermediate_event = InvocationEvent(
      author="root",
      content=genai_types.Content(
          parts=[genai_types.Part(text="Intermediate NL response.")]
      ),
  )
  actual = Invocation(
      app_details=details,
      user_content=query,
      intermediate_data=InvocationEvents(
          invocation_events=[intermediate_event]
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part(text="Final response.")]
      ),
  )
  expected = Invocation(
      app_details=details,
      user_content=query,
      final_response=genai_types.Content(
          parts=[genai_types.Part(text="Final response.")]
      ),
  )

  patch_target = "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response"
  with patch(patch_target, side_effect=failing_nl_response_eval):
    result = await hallucinations_metric.evaluate_invocations(
        [actual, actual],
        [expected, expected],
    )

  per_invocation = result.per_invocation_results
  assert len(per_invocation) == 2
  for invocation_result in per_invocation:
    assert invocation_result.score is None
    assert invocation_result.eval_status == EvalStatus.NOT_EVALUATED

  assert result.overall_score is None
  assert result.overall_eval_status == EvalStatus.NOT_EVALUATED
@pytest.mark.asyncio
async def test_evaluate_invocations_partial_failure(hallucinations_metric):
metric = hallucinations_metric