You've already forked adk-python
mirror of
https://github.com/encounter/adk-python.git
synced 2026-03-30 10:57:20 -07:00
fix: Overall eval status should be NOT_EVALUATED if no invocations were evaluated
PiperOrigin-RevId: 819322513
This commit is contained in:
committed by
Copybara-Service
parent
bae21027d9
commit
9fbed0b15a
@@ -694,7 +694,7 @@ class HallucinationsV1Evaluator(Evaluator):
|
||||
if not valid_results:
|
||||
return EvaluationResult(
|
||||
overall_score=None,
|
||||
overall_eval_status=EvalStatus.FAILED,
|
||||
overall_eval_status=EvalStatus.NOT_EVALUATED,
|
||||
per_invocation_results=per_invocation_results,
|
||||
)
|
||||
|
||||
|
||||
@@ -1445,6 +1445,73 @@ async def test_evaluate_invocations_no_nl_response(hallucinations_metric):
|
||||
assert per_invocation_result.eval_status == EvalStatus.NOT_EVALUATED
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric):
    """Check the overall result when no invocation could be evaluated.

    `_evaluate_nl_response` is patched to return `(None, "Judge model error.")`
    for every call. The test then expects both per-invocation results to have
    no score and status NOT_EVALUATED, and the overall result to have no score
    and status NOT_EVALUATED as well.
    """
    # Shared app configuration: a single "root" agent without any tools.
    root_agent = AgentDetails(
        name="root",
        instructions="Root agent instructions.",
        tool_declarations=[],
    )
    app_details = AppDetails(agent_details={"root": root_agent})
    user_content = genai_types.Content(
        parts=[genai_types.Part(text="User query.")]
    )

    # The actual invocation carries one intermediate NL event plus a final
    # response, so there is natural-language text available to be judged.
    intermediate_event = InvocationEvent(
        author="root",
        content=genai_types.Content(
            parts=[
                genai_types.Part(text="Intermediate NL response."),
            ]
        ),
    )
    actual_invocation = Invocation(
        app_details=app_details,
        user_content=user_content,
        intermediate_data=InvocationEvents(
            invocation_events=[intermediate_event]
        ),
        final_response=genai_types.Content(
            parts=[genai_types.Part(text="Final response.")]
        ),
    )
    expected_invocation = Invocation(
        app_details=app_details,
        user_content=user_content,
        final_response=genai_types.Content(
            parts=[genai_types.Part(text="Final response.")]
        ),
    )

    async def failing_judge(nl_response, context):
        # Stand-in for the judge call: no score, only an error message.
        return None, "Judge model error."

    judge_target = (
        "google.adk.evaluation.hallucinations_v1"
        ".HallucinationsV1Evaluator._evaluate_nl_response"
    )
    with patch(judge_target, side_effect=failing_judge):
        result = await hallucinations_metric.evaluate_invocations(
            [actual_invocation] * 2,
            [expected_invocation] * 2,
        )

    # Every invocation is reported individually, each without a score.
    assert len(result.per_invocation_results) == 2
    for per_invocation in result.per_invocation_results:
        assert per_invocation.score is None
        assert per_invocation.eval_status == EvalStatus.NOT_EVALUATED

    # With nothing evaluated, the aggregate must not be reported as a failure.
    assert result.overall_score is None
    assert result.overall_eval_status == EvalStatus.NOT_EVALUATED
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_invocations_partial_failure(hallucinations_metric):
|
||||
metric = hallucinations_metric
|
||||
|
||||
Reference in New Issue
Block a user