You've already forked adk-python
mirror of
https://github.com/encounter/adk-python.git
synced 2026-03-30 10:57:20 -07:00
* fixed CR comments * formatted via isort --------- Co-authored-by: Ankur <ankusharma@google.com>
This commit is contained in:
@@ -268,6 +268,22 @@ class LocalEvalService(BaseEvalService):
|
||||
else 'test_user_id'
|
||||
)
|
||||
|
||||
if (
|
||||
inference_result.status == InferenceStatus.FAILURE
|
||||
or inference_result.inferences is None
|
||||
):
|
||||
logger.error(
|
||||
'Evaluation attempted on failed inference for eval case `%s`.'
|
||||
' Error: %s',
|
||||
inference_result.eval_case_id,
|
||||
inference_result.error_message,
|
||||
)
|
||||
eval_case_result = await self._build_not_evaluated_eval_case_result(
|
||||
inference_result=inference_result,
|
||||
user_id=user_id,
|
||||
)
|
||||
return (inference_result, eval_case_result)
|
||||
|
||||
if eval_case.conversation_scenario is None and len(
|
||||
inference_result.inferences
|
||||
) != len(eval_case.conversation):
|
||||
@@ -464,6 +480,31 @@ class LocalEvalService(BaseEvalService):
|
||||
|
||||
return final_eval_status
|
||||
|
||||
async def _build_not_evaluated_eval_case_result(
    self,
    *,
    inference_result: InferenceResult,
    user_id: str,
) -> EvalCaseResult:
  """Builds the placeholder result for an eval case that was not scored.

  The session referenced by the inference result is fetched from the session
  service and attached to the returned object. No metrics are run, so both
  metric-result lists are empty.

  Args:
    inference_result: The inference result that could not be evaluated.
    user_id: User id used for the session lookup and stored on the result.

  Returns:
    An `EvalCaseResult` whose final status is `EvalStatus.NOT_EVALUATED`.
  """
  session = await self._session_service.get_session(
      app_name=inference_result.app_name,
      user_id=user_id,
      session_id=inference_result.session_id,
  )

  result_fields = {
      # The eval set id is stored under both `eval_set_file` and
      # `eval_set_id`.
      'eval_set_file': inference_result.eval_set_id,
      'eval_set_id': inference_result.eval_set_id,
      'eval_id': inference_result.eval_case_id,
      'final_eval_status': EvalStatus.NOT_EVALUATED,
      'overall_eval_metric_results': [],
      'eval_metric_result_per_invocation': [],
      'session_id': inference_result.session_id,
      'session_details': session,
      'user_id': user_id,
  }
  return EvalCaseResult(**result_fields)
|
||||
|
||||
async def _perform_inference_single_eval_item(
|
||||
self,
|
||||
app_name: str,
|
||||
|
||||
@@ -325,6 +325,82 @@ async def test_evaluate_success(
|
||||
assert mock_eval_set_results_manager.save_eval_set_result.call_count == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_evaluate_skips_failed_inference_results(
    eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
):
  """Checks `evaluate` over a mix of FAILURE, SUCCESS and UNKNOWN inferences.

  The FAILURE case must come back as NOT_EVALUATED with no metric results,
  while the SUCCESS and UNKNOWN cases must be scored and PASSED. All three
  cases are still looked up and have their results saved.
  """
  user_content = genai_types.Content(
      parts=[genai_types.Part(text="test user content.")]
  )
  final_response = genai_types.Content(
      parts=[genai_types.Part(text="test final response.")]
  )
  invocation = Invocation(
      user_content=user_content, final_response=final_response
  )

  # Every eval-case lookup resolves to the same single-turn mock case.
  mock_eval_case = mocker.MagicMock(spec=EvalCase)
  mock_eval_case.conversation = [invocation.model_copy(deep=True)]
  mock_eval_case.conversation_scenario = None
  mock_eval_case.session_input = None
  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case

  def make_inference_result(case_id, session_id, status, **extra):
    # All results share the app and eval set; only the case varies.
    return InferenceResult(
        app_name="test_app",
        eval_set_id="test_eval_set",
        eval_case_id=case_id,
        session_id=session_id,
        status=status,
        **extra,
    )

  inference_results = [
      make_inference_result(
          "case_failure",
          "session_fail",
          InferenceStatus.FAILURE,
          inferences=None,
          error_message="simulated failure",
      ),
      make_inference_result(
          "case_success",
          "session_success",
          InferenceStatus.SUCCESS,
          inferences=[invocation.model_copy(deep=True)],
      ),
      make_inference_result(
          "case_unknown",
          "session_unknown",
          InferenceStatus.UNKNOWN,
          inferences=[invocation.model_copy(deep=True)],
      ),
  ]
  evaluate_config = EvaluateConfig(
      eval_metrics=[EvalMetric(metric_name="fake_metric", threshold=0.5)],
      parallelism=2,
  )
  evaluate_request = EvaluateRequest(
      inference_results=inference_results, evaluate_config=evaluate_config
  )

  results = [
      result async for result in eval_service.evaluate(evaluate_request)
  ]

  assert len(results) == 3
  by_eval_id = {result.eval_id: result for result in results}

  skipped = by_eval_id["case_failure"]
  assert skipped.final_eval_status == EvalStatus.NOT_EVALUATED
  assert skipped.overall_eval_metric_results == []
  assert skipped.eval_metric_result_per_invocation == []

  # NOTE(review): 0.9 is presumably the score emitted by the fake metric that
  # the `eval_service` fixture registers -- confirm against the fixture.
  for eval_id in ("case_success", "case_unknown"):
    scored = by_eval_id[eval_id]
    assert scored.final_eval_status == EvalStatus.PASSED
    assert len(scored.overall_eval_metric_results) == 1
    assert scored.overall_eval_metric_results[0].metric_name == "fake_metric"
    assert scored.overall_eval_metric_results[0].score == 0.9

  assert mock_eval_sets_manager.get_eval_case.call_count == 3
  assert mock_eval_set_results_manager.save_eval_set_result.call_count == 3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_eval_case_not_found(
|
||||
eval_service,
|
||||
@@ -418,6 +494,93 @@ async def test_evaluate_single_inference_result(
|
||||
assert metric_result.eval_status == EvalStatus.PASSED
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_evaluate_single_inference_result_handles_failed_inference(
    eval_service, mock_eval_sets_manager, mocker
):
  """A FAILURE inference result is reported as NOT_EVALUATED, unscored.

  The eval case is still fetched exactly once, but the returned result
  carries no overall or per-invocation metric results.
  """
  # The eval case itself is well-formed; only the inference side failed.
  expected_turn = Invocation(
      user_content=genai_types.Content(
          parts=[genai_types.Part(text="test user content.")]
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part(text="test final response.")]
      ),
  )
  mock_eval_case = mocker.MagicMock(spec=EvalCase)
  mock_eval_case.conversation = [expected_turn.model_copy(deep=True)]
  mock_eval_case.conversation_scenario = None
  mock_eval_case.session_input = None
  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case

  failed_inference = InferenceResult(
      app_name="test_app",
      eval_set_id="test_eval_set",
      eval_case_id="case1",
      inferences=None,
      session_id="session1",
      status=InferenceStatus.FAILURE,
      error_message="simulated inference failure",
  )
  evaluate_config = EvaluateConfig(
      eval_metrics=[EvalMetric(metric_name="fake_metric", threshold=0.5)],
      parallelism=1,
  )

  returned = await eval_service._evaluate_single_inference_result(
      inference_result=failed_inference, evaluate_config=evaluate_config
  )
  _, result = returned

  assert isinstance(result, EvalCaseResult)
  assert result.eval_id == "case1"
  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
  assert result.overall_eval_metric_results == []
  assert result.eval_metric_result_per_invocation == []
  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
  )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_evaluate_single_inference_result_handles_missing_inferences(
    eval_service, mock_eval_sets_manager, mocker
):
  """A SUCCESS result whose `inferences` is None is still NOT_EVALUATED.

  This covers the case where the status alone does not signal a problem but
  there is nothing to score. The eval case is fetched exactly once and no
  metric results are produced.
  """
  expected_turn = Invocation(
      user_content=genai_types.Content(
          parts=[genai_types.Part(text="test user content.")]
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part(text="test final response.")]
      ),
  )
  mock_eval_case = mocker.MagicMock(spec=EvalCase)
  mock_eval_case.conversation = [expected_turn.model_copy(deep=True)]
  mock_eval_case.conversation_scenario = None
  mock_eval_case.session_input = None
  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case

  # Status says SUCCESS, yet no inferences were recorded.
  empty_inference = InferenceResult(
      app_name="test_app",
      eval_set_id="test_eval_set",
      eval_case_id="case1",
      inferences=None,
      session_id="session1",
      status=InferenceStatus.SUCCESS,
  )
  evaluate_config = EvaluateConfig(
      eval_metrics=[EvalMetric(metric_name="fake_metric", threshold=0.5)],
      parallelism=1,
  )

  returned = await eval_service._evaluate_single_inference_result(
      inference_result=empty_inference, evaluate_config=evaluate_config
  )
  _, result = returned

  assert isinstance(result, EvalCaseResult)
  assert result.eval_id == "case1"
  assert result.final_eval_status == EvalStatus.NOT_EVALUATED
  assert result.overall_eval_metric_results == []
  assert result.eval_metric_result_per_invocation == []
  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
  )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_single_inference_result_for_conversation_scenario(
|
||||
eval_service, mock_eval_sets_manager, mocker
|
||||
|
||||
Reference in New Issue
Block a user