You've already forked adk-python
mirror of
https://github.com/encounter/adk-python.git
synced 2026-03-30 10:57:20 -07:00
chore: Fix evaluation test cases to only use pytest features
PiperOrigin-RevId: 820700378
This commit is contained in:
committed by
Copybara-Service
parent
9dce06f9b0
commit
cf3403231d
@@ -14,8 +14,6 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest import mock
|
||||
|
||||
from google.adk.evaluation.eval_config import _DEFAULT_EVAL_CONFIG
|
||||
from google.adk.evaluation.eval_config import EvalConfig
|
||||
from google.adk.evaluation.eval_config import get_eval_metrics_from_config
|
||||
@@ -28,13 +26,14 @@ def test_get_evaluation_criteria_or_default_returns_default():
|
||||
assert get_evaluation_criteria_or_default("") == _DEFAULT_EVAL_CONFIG
|
||||
|
||||
|
||||
def test_get_evaluation_criteria_or_default_reads_from_file():
|
||||
def test_get_evaluation_criteria_or_default_reads_from_file(mocker):
|
||||
eval_config = EvalConfig(
|
||||
criteria={"tool_trajectory_avg_score": 0.5, "response_match_score": 0.5}
|
||||
)
|
||||
mock_open = mock.mock_open(read_data=eval_config.model_dump_json())
|
||||
with mock.patch("builtins.open", mock_open):
|
||||
assert get_evaluation_criteria_or_default("dummy_path") == eval_config
|
||||
mocker.patch(
|
||||
"builtins.open", mocker.mock_open(read_data=eval_config.model_dump_json())
|
||||
)
|
||||
assert get_evaluation_criteria_or_default("dummy_path") == eval_config
|
||||
|
||||
|
||||
def test_get_eval_metrics_from_config():
|
||||
|
||||
@@ -14,8 +14,6 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest import mock
|
||||
|
||||
from google.adk.evaluation.app_details import AgentDetails
|
||||
from google.adk.evaluation.app_details import AppDetails
|
||||
from google.adk.evaluation.evaluation_generator import EvaluationGenerator
|
||||
@@ -206,17 +204,17 @@ class TestConvertEventsToEvalInvocation:
|
||||
class TestGetAppDetailsByInvocationId:
|
||||
"""Test cases for EvaluationGenerator._get_app_details_by_invocation_id method."""
|
||||
|
||||
def test_get_app_details_by_invocation_id_empty(self):
|
||||
def test_get_app_details_by_invocation_id_empty(self, mocker):
|
||||
"""Tests with an empty list of events."""
|
||||
mock_request_intercepter = mock.MagicMock(spec=_RequestIntercepterPlugin)
|
||||
mock_request_intercepter = mocker.MagicMock(spec=_RequestIntercepterPlugin)
|
||||
app_details = EvaluationGenerator._get_app_details_by_invocation_id(
|
||||
[], mock_request_intercepter
|
||||
)
|
||||
assert app_details == {}
|
||||
|
||||
def test_get_app_details_by_invocation_id_no_model_requests(self):
|
||||
def test_get_app_details_by_invocation_id_no_model_requests(self, mocker):
|
||||
"""Tests when request_intercepter returns no model requests."""
|
||||
mock_request_intercepter = mock.MagicMock(spec=_RequestIntercepterPlugin)
|
||||
mock_request_intercepter = mocker.MagicMock(spec=_RequestIntercepterPlugin)
|
||||
mock_request_intercepter.get_model_request.return_value = None
|
||||
events = [
|
||||
_build_event("user", [types.Part(text="Hello")], "inv1"),
|
||||
@@ -230,9 +228,9 @@ class TestGetAppDetailsByInvocationId:
|
||||
events[1]
|
||||
)
|
||||
|
||||
def test_get_app_details_single_invocation_single_agent(self):
|
||||
def test_get_app_details_single_invocation_single_agent(self, mocker):
|
||||
"""Tests a single invocation with one agent."""
|
||||
mock_request_intercepter = mock.MagicMock(spec=_RequestIntercepterPlugin)
|
||||
mock_request_intercepter = mocker.MagicMock(spec=_RequestIntercepterPlugin)
|
||||
mock_llm_request = LlmRequest(model="test")
|
||||
mock_llm_request.config.system_instruction = "instruction1"
|
||||
mock_llm_request.config.tools = [types.Tool()]
|
||||
@@ -262,9 +260,9 @@ class TestGetAppDetailsByInvocationId:
|
||||
events[1]
|
||||
)
|
||||
|
||||
def test_get_app_details_multiple_invocations_multiple_agents(self):
|
||||
def test_get_app_details_multiple_invocations_multiple_agents(self, mocker):
|
||||
"""Tests multiple invocations with multiple agents."""
|
||||
mock_request_intercepter = mock.MagicMock(spec=_RequestIntercepterPlugin)
|
||||
mock_request_intercepter = mocker.MagicMock(spec=_RequestIntercepterPlugin)
|
||||
|
||||
def get_model_request_side_effect(event):
|
||||
mock_llm_request = LlmRequest(model="test")
|
||||
|
||||
@@ -13,9 +13,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
from unittest.mock import AsyncMock
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
from google.adk.evaluation.app_details import AgentDetails
|
||||
from google.adk.evaluation.app_details import AppDetails
|
||||
@@ -34,13 +31,13 @@ import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_llm_registry():
|
||||
def mock_llm_registry(mocker):
|
||||
"""Mocks LLMRegistry to avoid actual model loading during tests."""
|
||||
with patch(
|
||||
MockLLMRegistry = mocker.patch(
|
||||
"google.adk.evaluation.hallucinations_v1.LLMRegistry"
|
||||
) as MockLLMRegistry:
|
||||
MockLLMRegistry.return_value.resolve.return_value = AsyncMock
|
||||
yield
|
||||
)
|
||||
MockLLMRegistry.return_value.resolve.return_value = mocker.MagicMock()
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -176,8 +173,8 @@ contradicting_excerpt: Pears are blue fruits
|
||||
class TestEvaluateNlResponse:
|
||||
"""Test cases for _evaluate_nl_response method."""
|
||||
|
||||
def _create_genai_response(self, text):
|
||||
response_mock = MagicMock()
|
||||
def _create_genai_response(self, text, mocker):
|
||||
response_mock = mocker.MagicMock()
|
||||
response_mock.content = genai_types.Content(
|
||||
parts=[genai_types.Part(text=text)]
|
||||
)
|
||||
@@ -185,12 +182,12 @@ class TestEvaluateNlResponse:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_nl_response_unexpected_labels(
|
||||
self, hallucinations_metric
|
||||
self, hallucinations_metric, mocker
|
||||
):
|
||||
"""Tests _evaluate_nl_response with unexpected labels."""
|
||||
metric = hallucinations_metric
|
||||
seg_response = self._create_genai_response(
|
||||
"<sentence>sentence 1</sentence><sentence>sentence 2</sentence>"
|
||||
"<sentence>sentence 1</sentence><sentence>sentence 2</sentence>", mocker
|
||||
)
|
||||
val_response_text = """sentence: sentence 1
|
||||
label:
|
||||
@@ -204,7 +201,7 @@ rationale: r2
|
||||
supporting_excerpt: null
|
||||
contradicting_excerpt: null
|
||||
"""
|
||||
val_response = self._create_genai_response(val_response_text)
|
||||
val_response = self._create_genai_response(val_response_text, mocker)
|
||||
|
||||
async def seg_gen():
|
||||
yield seg_response
|
||||
@@ -212,7 +209,7 @@ contradicting_excerpt: null
|
||||
async def val_gen():
|
||||
yield val_response
|
||||
|
||||
metric._judge_model.generate_content_async = MagicMock(
|
||||
metric._judge_model.generate_content_async = mocker.MagicMock(
|
||||
side_effect=[
|
||||
seg_gen(),
|
||||
val_gen(),
|
||||
@@ -223,14 +220,14 @@ contradicting_excerpt: null
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_nl_response_missing_label(
|
||||
self, hallucinations_metric
|
||||
self, hallucinations_metric, mocker
|
||||
):
|
||||
"""Tests _evaluate_nl_response with missing labels in validation results."""
|
||||
metric = hallucinations_metric
|
||||
seg_response = self._create_genai_response(
|
||||
"<sentence>sentence 1</sentence>"
|
||||
"<sentence>sentence 1</sentence>", mocker
|
||||
)
|
||||
val_response = self._create_genai_response("val_response")
|
||||
val_response = self._create_genai_response("val_response", mocker)
|
||||
|
||||
async def seg_gen():
|
||||
yield seg_response
|
||||
@@ -238,7 +235,7 @@ contradicting_excerpt: null
|
||||
async def val_gen():
|
||||
yield val_response
|
||||
|
||||
metric._judge_model.generate_content_async = MagicMock(
|
||||
metric._judge_model.generate_content_async = mocker.MagicMock(
|
||||
side_effect=[
|
||||
seg_gen(),
|
||||
val_gen(),
|
||||
@@ -585,7 +582,7 @@ class TestEvaluateInvocationsAgentTree:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_invocations_multi_agents(
|
||||
self, hallucinations_metric, agent_tree_data
|
||||
self, hallucinations_metric, agent_tree_data, mocker
|
||||
):
|
||||
"""Tests evaluate_invocations with agent tree and checks contexts."""
|
||||
invocation, expected_invocation = agent_tree_data
|
||||
@@ -804,22 +801,22 @@ Agent2 response.
|
||||
}])
|
||||
return None, "error"
|
||||
|
||||
with patch(
|
||||
mocker.patch(
|
||||
"google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
|
||||
side_effect=mock_evaluate_nl_response,
|
||||
):
|
||||
result = await metric.evaluate_invocations(
|
||||
[invocation], [expected_invocation]
|
||||
)
|
||||
)
|
||||
result = await metric.evaluate_invocations(
|
||||
[invocation], [expected_invocation]
|
||||
)
|
||||
|
||||
assert result.overall_score == pytest.approx(0.5)
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == pytest.approx(0.5)
|
||||
assert result.overall_score == pytest.approx(0.5)
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == pytest.approx(0.5)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_invocations_agent_tree_skip_intermediate(
|
||||
self, mock_llm_registry, agent_tree_data
|
||||
self, mock_llm_registry, agent_tree_data, mocker
|
||||
):
|
||||
"""Tests evaluate_invocations with agent tree skipping intermediate steps."""
|
||||
invocation, expected_invocation = agent_tree_data
|
||||
@@ -927,18 +924,18 @@ Agent2 response.
|
||||
"label": "contradictory",
|
||||
}])
|
||||
|
||||
with patch(
|
||||
mocker.patch(
|
||||
"google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
|
||||
side_effect=mock_evaluate_nl_response,
|
||||
):
|
||||
result = await metric.evaluate_invocations(
|
||||
[invocation], [expected_invocation]
|
||||
)
|
||||
)
|
||||
result = await metric.evaluate_invocations(
|
||||
[invocation], [expected_invocation]
|
||||
)
|
||||
|
||||
assert result.overall_score == 0.0
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == 0.0
|
||||
assert result.overall_score == 0.0
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == 0.0
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -1053,7 +1050,7 @@ class TestEvaluateInvocationsTimeWeather:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_invocations_time_weather(
|
||||
self, hallucinations_metric, time_weather_data
|
||||
self, hallucinations_metric, time_weather_data, mocker
|
||||
):
|
||||
"""Tests evaluate_invocations with time/weather agent."""
|
||||
invocation, response1, response2 = time_weather_data
|
||||
@@ -1190,20 +1187,20 @@ tool_outputs:
|
||||
])
|
||||
return None, "error"
|
||||
|
||||
with patch(
|
||||
mocker.patch(
|
||||
"google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
|
||||
side_effect=mock_evaluate_nl_response,
|
||||
):
|
||||
result = await metric.evaluate_invocations([invocation], [invocation])
|
||||
)
|
||||
result = await metric.evaluate_invocations([invocation], [invocation])
|
||||
|
||||
assert result.overall_score == pytest.approx(5 / 6)
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == pytest.approx(5 / 6)
|
||||
assert result.overall_score == pytest.approx(5 / 6)
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == pytest.approx(5 / 6)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_invocations_time_weather_skip_intermediate(
|
||||
self, mock_llm_registry, time_weather_data
|
||||
self, mock_llm_registry, time_weather_data, mocker
|
||||
):
|
||||
"""Tests evaluate_invocations with time/weather agent."""
|
||||
invocation, _, response2 = time_weather_data
|
||||
@@ -1300,20 +1297,20 @@ tool_outputs:
|
||||
{"sentence": sentence2, "label": "supported"},
|
||||
])
|
||||
|
||||
with patch(
|
||||
mocker.patch(
|
||||
"google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
|
||||
side_effect=mock_evaluate_nl_response,
|
||||
):
|
||||
result = await metric.evaluate_invocations([invocation], [invocation])
|
||||
)
|
||||
result = await metric.evaluate_invocations([invocation], [invocation])
|
||||
|
||||
assert result.overall_score == 1.0
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == 1.0
|
||||
assert result.overall_score == 1.0
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == 1.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_invocations_success_path(hallucinations_metric):
|
||||
async def test_evaluate_invocations_success_path(hallucinations_metric, mocker):
|
||||
metric = hallucinations_metric
|
||||
app_details = AppDetails(
|
||||
agent_details={
|
||||
@@ -1380,18 +1377,18 @@ async def test_evaluate_invocations_success_path(hallucinations_metric):
|
||||
)
|
||||
return None, "error"
|
||||
|
||||
with patch(
|
||||
mocker.patch(
|
||||
"google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
|
||||
side_effect=mock_evaluate_nl_response,
|
||||
):
|
||||
result = await metric.evaluate_invocations(
|
||||
[actual_invocation], [expected_invocation]
|
||||
)
|
||||
)
|
||||
result = await metric.evaluate_invocations(
|
||||
[actual_invocation], [expected_invocation]
|
||||
)
|
||||
|
||||
assert result.overall_score == pytest.approx(0.5)
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == pytest.approx(0.5)
|
||||
assert result.overall_score == pytest.approx(0.5)
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == pytest.approx(0.5)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -1446,7 +1443,9 @@ async def test_evaluate_invocations_no_nl_response(hallucinations_metric):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric):
|
||||
async def test_evaluate_all_invocations_not_evaluated(
|
||||
hallucinations_metric, mocker
|
||||
):
|
||||
metric = hallucinations_metric
|
||||
app_details = AppDetails(
|
||||
agent_details={
|
||||
@@ -1490,30 +1489,32 @@ async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric):
|
||||
async def mock_evaluate_nl_response(nl_response, context):
|
||||
return None, "Judge model error."
|
||||
|
||||
with patch(
|
||||
mocker.patch(
|
||||
"google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
|
||||
side_effect=mock_evaluate_nl_response,
|
||||
):
|
||||
result = await metric.evaluate_invocations(
|
||||
[actual_invocation, actual_invocation],
|
||||
[expected_invocation, expected_invocation],
|
||||
)
|
||||
)
|
||||
result = await metric.evaluate_invocations(
|
||||
[actual_invocation, actual_invocation],
|
||||
[expected_invocation, expected_invocation],
|
||||
)
|
||||
|
||||
assert len(result.per_invocation_results) == 2
|
||||
assert result.per_invocation_results[0].score is None
|
||||
assert (
|
||||
result.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED
|
||||
)
|
||||
assert result.per_invocation_results[1].score is None
|
||||
assert (
|
||||
result.per_invocation_results[1].eval_status == EvalStatus.NOT_EVALUATED
|
||||
)
|
||||
assert result.overall_score is None
|
||||
assert result.overall_eval_status == EvalStatus.NOT_EVALUATED
|
||||
assert len(result.per_invocation_results) == 2
|
||||
assert result.per_invocation_results[0].score is None
|
||||
assert (
|
||||
result.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED
|
||||
)
|
||||
assert result.per_invocation_results[1].score is None
|
||||
assert (
|
||||
result.per_invocation_results[1].eval_status == EvalStatus.NOT_EVALUATED
|
||||
)
|
||||
assert result.overall_score is None
|
||||
assert result.overall_eval_status == EvalStatus.NOT_EVALUATED
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_invocations_partial_failure(hallucinations_metric):
|
||||
async def test_evaluate_invocations_partial_failure(
|
||||
hallucinations_metric, mocker
|
||||
):
|
||||
metric = hallucinations_metric
|
||||
app_details = AppDetails(
|
||||
agent_details={
|
||||
@@ -1563,15 +1564,15 @@ async def test_evaluate_invocations_partial_failure(hallucinations_metric):
|
||||
return None, "some error during evaluation"
|
||||
return None, "error"
|
||||
|
||||
with patch(
|
||||
mocker.patch(
|
||||
"google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response",
|
||||
side_effect=mock_evaluate_nl_response,
|
||||
):
|
||||
result = await metric.evaluate_invocations(
|
||||
[actual_invocation], [expected_invocation]
|
||||
)
|
||||
)
|
||||
result = await metric.evaluate_invocations(
|
||||
[actual_invocation], [expected_invocation]
|
||||
)
|
||||
|
||||
assert result.overall_score == 0.8
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == 0.8
|
||||
assert result.overall_score == 0.8
|
||||
assert len(result.per_invocation_results) == 1
|
||||
per_invocation_result = result.per_invocation_results[0]
|
||||
assert per_invocation_result.score == 0.8
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from google.adk.evaluation.eval_case import Invocation
|
||||
from google.adk.evaluation.eval_metrics import EvalMetric
|
||||
@@ -128,8 +127,8 @@ def test_llm_as_judge_init_unregistered_model():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_judge_model():
|
||||
mock_judge_model = MagicMock()
|
||||
def mock_judge_model(mocker):
|
||||
mock_judge_model = mocker.MagicMock()
|
||||
|
||||
async def mock_generate_content_async(llm_request):
|
||||
yield LlmResponse(
|
||||
@@ -144,30 +143,30 @@ def mock_judge_model():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_invocations_with_mock(
|
||||
mock_llm_as_judge, mock_judge_model
|
||||
mock_llm_as_judge, mock_judge_model, mocker
|
||||
):
|
||||
mock_llm_as_judge._judge_model = mock_judge_model
|
||||
|
||||
mock_format_auto_rater_prompt = MagicMock(
|
||||
mock_format_auto_rater_prompt = mocker.MagicMock(
|
||||
wraps=mock_llm_as_judge.format_auto_rater_prompt
|
||||
)
|
||||
mock_llm_as_judge.format_auto_rater_prompt = mock_format_auto_rater_prompt
|
||||
|
||||
mock_convert_auto_rater_response_to_score = MagicMock(
|
||||
mock_convert_auto_rater_response_to_score = mocker.MagicMock(
|
||||
wraps=mock_llm_as_judge.convert_auto_rater_response_to_score
|
||||
)
|
||||
mock_llm_as_judge.convert_auto_rater_response_to_score = (
|
||||
mock_convert_auto_rater_response_to_score
|
||||
)
|
||||
|
||||
mock_aggregate_per_invocation_samples = MagicMock(
|
||||
mock_aggregate_per_invocation_samples = mocker.MagicMock(
|
||||
wraps=mock_llm_as_judge.aggregate_per_invocation_samples
|
||||
)
|
||||
mock_llm_as_judge.aggregate_per_invocation_samples = (
|
||||
mock_aggregate_per_invocation_samples
|
||||
)
|
||||
|
||||
mock_aggregate_invocation_results = MagicMock(
|
||||
mock_aggregate_invocation_results = mocker.MagicMock(
|
||||
wraps=mock_llm_as_judge.aggregate_invocation_results
|
||||
)
|
||||
mock_llm_as_judge.aggregate_invocation_results = (
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from unittest import mock
|
||||
|
||||
from google.adk.agents.llm_agent import LlmAgent
|
||||
from google.adk.errors.not_found_error import NotFoundError
|
||||
@@ -47,8 +46,8 @@ import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_eval_sets_manager():
|
||||
return mock.create_autospec(EvalSetsManager)
|
||||
def mock_eval_sets_manager(mocker):
|
||||
return mocker.create_autospec(EvalSetsManager)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -58,8 +57,8 @@ def dummy_agent():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_eval_set_results_manager():
|
||||
return mock.create_autospec(EvalSetResultsManager)
|
||||
def mock_eval_set_results_manager(mocker):
|
||||
return mocker.create_autospec(EvalSetResultsManager)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -118,6 +117,7 @@ async def test_perform_inference_success(
|
||||
eval_service,
|
||||
dummy_agent,
|
||||
mock_eval_sets_manager,
|
||||
mocker,
|
||||
):
|
||||
eval_set = EvalSet(
|
||||
eval_set_id="test_eval_set",
|
||||
@@ -128,8 +128,8 @@ async def test_perform_inference_success(
|
||||
)
|
||||
mock_eval_sets_manager.get_eval_set.return_value = eval_set
|
||||
|
||||
mock_inference_result = mock.MagicMock()
|
||||
eval_service._perform_inference_sigle_eval_item = mock.AsyncMock(
|
||||
mock_inference_result = mocker.MagicMock()
|
||||
eval_service._perform_inference_sigle_eval_item = mocker.AsyncMock(
|
||||
return_value=mock_inference_result
|
||||
)
|
||||
|
||||
@@ -157,6 +157,7 @@ async def test_perform_inference_with_case_ids(
|
||||
eval_service,
|
||||
dummy_agent,
|
||||
mock_eval_sets_manager,
|
||||
mocker,
|
||||
):
|
||||
eval_set = EvalSet(
|
||||
eval_set_id="test_eval_set",
|
||||
@@ -168,8 +169,8 @@ async def test_perform_inference_with_case_ids(
|
||||
)
|
||||
mock_eval_sets_manager.get_eval_set.return_value = eval_set
|
||||
|
||||
mock_inference_result = mock.MagicMock()
|
||||
eval_service._perform_inference_sigle_eval_item = mock.AsyncMock(
|
||||
mock_inference_result = mocker.MagicMock()
|
||||
eval_service._perform_inference_sigle_eval_item = mocker.AsyncMock(
|
||||
return_value=mock_inference_result
|
||||
)
|
||||
|
||||
@@ -219,7 +220,7 @@ async def test_perform_inference_eval_set_not_found(
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_success(
|
||||
eval_service, mock_eval_sets_manager, mock_eval_set_results_manager
|
||||
eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
|
||||
):
|
||||
inference_results = [
|
||||
InferenceResult(
|
||||
@@ -243,7 +244,7 @@ async def test_evaluate_success(
|
||||
evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2),
|
||||
)
|
||||
|
||||
mock_eval_case = mock.MagicMock(spec=EvalCase)
|
||||
mock_eval_case = mocker.MagicMock(spec=EvalCase)
|
||||
mock_eval_case.conversation = []
|
||||
mock_eval_case.session_input = None
|
||||
mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
|
||||
@@ -290,7 +291,7 @@ async def test_evaluate_eval_case_not_found(
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_single_inference_result(
|
||||
eval_service, mock_eval_sets_manager, mock_eval_set_results_manager
|
||||
eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
|
||||
):
|
||||
invocation = Invocation(
|
||||
user_content=genai_types.Content(
|
||||
@@ -314,7 +315,7 @@ async def test_evaluate_single_inference_result(
|
||||
eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
|
||||
evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
|
||||
|
||||
mock_eval_case = mock.MagicMock(spec=EvalCase)
|
||||
mock_eval_case = mocker.MagicMock(spec=EvalCase)
|
||||
mock_eval_case.conversation = [
|
||||
invocation.model_copy(deep=True),
|
||||
invocation.model_copy(deep=True),
|
||||
@@ -370,7 +371,7 @@ def test_generate_final_eval_status_doesn_t_throw_on(eval_service):
|
||||
@pytest.mark.skipif(
|
||||
sys.version_info < (3, 10), reason="MCP tool requires Python 3.10+"
|
||||
)
|
||||
async def test_mcp_stdio_agent_no_runtime_error():
|
||||
async def test_mcp_stdio_agent_no_runtime_error(mocker):
|
||||
"""Test that LocalEvalService can handle MCP stdio agents without RuntimeError.
|
||||
|
||||
This is a regression test for GitHub issue #2196:
|
||||
@@ -421,7 +422,7 @@ async def test_mcp_stdio_agent_no_runtime_error():
|
||||
)
|
||||
|
||||
# Create a mock eval sets manager that returns an eval case
|
||||
mock_eval_sets_manager = mock.create_autospec(EvalSetsManager)
|
||||
mock_eval_sets_manager = mocker.create_autospec(EvalSetsManager)
|
||||
test_eval_case = EvalCase(
|
||||
eval_id="test_mcp_case",
|
||||
conversation=[
|
||||
|
||||
@@ -19,7 +19,6 @@ import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
from unittest.mock import patch
|
||||
|
||||
from google.adk.errors.not_found_error import NotFoundError
|
||||
from google.adk.evaluation._eval_set_results_manager_utils import _sanitize_eval_set_result_name
|
||||
@@ -68,12 +67,11 @@ class TestLocalEvalSetResultsManager:
|
||||
eval_case_results=self.eval_case_results,
|
||||
creation_timestamp=self.timestamp,
|
||||
)
|
||||
|
||||
def teardown(self):
|
||||
yield
|
||||
shutil.rmtree(self.temp_dir)
|
||||
|
||||
@patch("time.time")
|
||||
def test_save_eval_set_result(self, mock_time):
|
||||
def test_save_eval_set_result(self, mocker):
|
||||
mock_time = mocker.patch("time.time")
|
||||
mock_time.return_value = self.timestamp
|
||||
self.manager.save_eval_set_result(
|
||||
self.app_name, self.eval_set_id, self.eval_case_results
|
||||
@@ -93,8 +91,8 @@ class TestLocalEvalSetResultsManager:
|
||||
expected_eval_set_result_json = self.eval_set_result.model_dump_json()
|
||||
assert expected_eval_set_result_json == actual_eval_set_result_json
|
||||
|
||||
@patch("time.time")
|
||||
def test_get_eval_set_result(self, mock_time):
|
||||
def test_get_eval_set_result(self, mocker):
|
||||
mock_time = mocker.patch("time.time")
|
||||
mock_time.return_value = self.timestamp
|
||||
self.manager.save_eval_set_result(
|
||||
self.app_name, self.eval_set_id, self.eval_case_results
|
||||
@@ -104,15 +102,15 @@ class TestLocalEvalSetResultsManager:
|
||||
)
|
||||
assert retrieved_result == self.eval_set_result
|
||||
|
||||
@patch("time.time")
|
||||
def test_get_eval_set_result_not_found(self, mock_time):
|
||||
def test_get_eval_set_result_not_found(self, mocker):
|
||||
mock_time = mocker.patch("time.time")
|
||||
mock_time.return_value = self.timestamp
|
||||
|
||||
with pytest.raises(NotFoundError) as e:
|
||||
self.manager.get_eval_set_result(self.app_name, "non_existent_id")
|
||||
|
||||
@patch("time.time")
|
||||
def test_list_eval_set_results(self, mock_time):
|
||||
def test_list_eval_set_results(self, mocker):
|
||||
mock_time = mocker.patch("time.time")
|
||||
mock_time.return_value = self.timestamp
|
||||
# Save two eval set results for the same app
|
||||
self.manager.save_eval_set_result(
|
||||
|
||||
@@ -14,8 +14,6 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest import mock
|
||||
|
||||
from google.adk.agents.callback_context import CallbackContext
|
||||
from google.adk.evaluation.request_intercepter_plugin import _LLM_REQUEST_ID_KEY
|
||||
from google.adk.evaluation.request_intercepter_plugin import _RequestIntercepterPlugin
|
||||
@@ -26,7 +24,7 @@ from google.genai import types
|
||||
|
||||
class TestRequestIntercepterPlugin:
|
||||
|
||||
async def test_intercept_request_and_response(self):
|
||||
async def test_intercept_request_and_response(self, mocker):
|
||||
plugin = _RequestIntercepterPlugin(name="test_plugin")
|
||||
llm_request = LlmRequest(
|
||||
model="test_model",
|
||||
@@ -37,7 +35,7 @@ class TestRequestIntercepterPlugin:
|
||||
)
|
||||
],
|
||||
)
|
||||
mock_invocation_context = mock.MagicMock()
|
||||
mock_invocation_context = mocker.MagicMock()
|
||||
mock_invocation_context.session.state = {}
|
||||
callback_context = CallbackContext(mock_invocation_context)
|
||||
llm_response = LlmResponse()
|
||||
|
||||
@@ -12,8 +12,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
"""Tests for the Response Evaluator."""
|
||||
from unittest.mock import patch
|
||||
|
||||
from google.adk.evaluation.eval_case import Invocation
|
||||
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
|
||||
@@ -24,14 +25,14 @@ import pytest
|
||||
from vertexai import types as vertexai_types
|
||||
|
||||
|
||||
@patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
class TestResponseEvaluator:
|
||||
"""A class to help organize "patch" that are applicable to all tests."""
|
||||
|
||||
def test_evaluate_invocations_rouge_metric(self, mock_perform_eval):
|
||||
def test_evaluate_invocations_rouge_metric(self, mocker):
|
||||
"""Test evaluate_invocations function for Rouge metric."""
|
||||
mock_perform_eval = mocker.patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
actual_invocations = [
|
||||
Invocation(
|
||||
user_content=genai_types.Content(
|
||||
@@ -67,10 +68,11 @@ class TestResponseEvaluator:
|
||||
assert evaluation_result.overall_eval_status == EvalStatus.FAILED
|
||||
mock_perform_eval.assert_not_called() # Ensure _perform_eval was not called
|
||||
|
||||
def test_evaluate_invocations_coherence_metric_passed(
|
||||
self, mock_perform_eval
|
||||
):
|
||||
def test_evaluate_invocations_coherence_metric_passed(self, mocker):
|
||||
"""Test evaluate_invocations function for Coherence metric."""
|
||||
mock_perform_eval = mocker.patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
actual_invocations = [
|
||||
Invocation(
|
||||
user_content=genai_types.Content(
|
||||
@@ -115,7 +117,7 @@ class TestResponseEvaluator:
|
||||
vertexai_types.PrebuiltMetric.COHERENCE.name
|
||||
]
|
||||
|
||||
def test_get_metric_info_response_evaluation_score(self, mock_perform_eval):
|
||||
def test_get_metric_info_response_evaluation_score(self):
|
||||
"""Test get_metric_info function for response evaluation metric."""
|
||||
metric_info = ResponseEvaluator.get_metric_info(
|
||||
PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value
|
||||
@@ -127,7 +129,7 @@ class TestResponseEvaluator:
|
||||
assert metric_info.metric_value_info.interval.min_value == 1.0
|
||||
assert metric_info.metric_value_info.interval.max_value == 5.0
|
||||
|
||||
def test_get_metric_info_response_match_score(self, mock_perform_eval):
|
||||
def test_get_metric_info_response_match_score(self):
|
||||
"""Test get_metric_info function for response match metric."""
|
||||
metric_info = ResponseEvaluator.get_metric_info(
|
||||
PrebuiltMetrics.RESPONSE_MATCH_SCORE.value
|
||||
@@ -136,7 +138,7 @@ class TestResponseEvaluator:
|
||||
assert metric_info.metric_value_info.interval.min_value == 0.0
|
||||
assert metric_info.metric_value_info.interval.max_value == 1.0
|
||||
|
||||
def test_get_metric_info_invalid(self, mock_perform_eval):
|
||||
def test_get_metric_info_invalid(self):
|
||||
"""Test get_metric_info function for invalid metric."""
|
||||
with pytest.raises(ValueError):
|
||||
ResponseEvaluator.get_metric_info("invalid_metric")
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
# limitations under the License.
|
||||
|
||||
"""Tests for the Response Evaluator."""
|
||||
from unittest.mock import patch
|
||||
|
||||
from google.adk.evaluation.eval_case import Invocation
|
||||
from google.adk.evaluation.eval_metrics import EvalMetric
|
||||
@@ -24,16 +23,14 @@ from google.genai import types as genai_types
|
||||
from vertexai import types as vertexai_types
|
||||
|
||||
|
||||
@patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
class TestSafetyEvaluatorV1:
|
||||
"""A class to help organize "patch" that are applicable to all tests."""
|
||||
|
||||
def test_evaluate_invocations_coherence_metric_passed(
|
||||
self, mock_perform_eval
|
||||
):
|
||||
def test_evaluate_invocations_coherence_metric_passed(self, mocker):
|
||||
"""Test evaluate_invocations function for Coherence metric."""
|
||||
mock_perform_eval = mocker.patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
actual_invocations = [
|
||||
Invocation(
|
||||
user_content=genai_types.Content(
|
||||
@@ -78,7 +75,7 @@ class TestSafetyEvaluatorV1:
|
||||
vertexai_types.PrebuiltMetric.SAFETY.name
|
||||
]
|
||||
|
||||
def test_get_metric_info(self, mock_perform_eval):
|
||||
def test_get_metric_info(self):
|
||||
"""Test get_metric_info function for Safety metric."""
|
||||
metric_info = SafetyEvaluatorV1.get_metric_info()
|
||||
assert metric_info.metric_name == PrebuiltMetrics.SAFETY_V1.value
|
||||
|
||||
@@ -12,10 +12,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
"""Tests for the Response Evaluator."""
|
||||
import math
|
||||
import random
|
||||
from unittest.mock import patch
|
||||
|
||||
from google.adk.evaluation.eval_case import Invocation
|
||||
from google.adk.evaluation.evaluator import EvalStatus
|
||||
@@ -25,14 +26,14 @@ import pytest
|
||||
from vertexai import types as vertexai_types
|
||||
|
||||
|
||||
@patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
class TestVertexAiEvalFacade:
|
||||
"""A class to help organize "patch" that are applicable to all tests."""
|
||||
|
||||
def test_evaluate_invocations_metric_passed(self, mock_perform_eval):
|
||||
def test_evaluate_invocations_metric_passed(self, mocker):
|
||||
"""Test evaluate_invocations function for a metric."""
|
||||
mock_perform_eval = mocker.patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
actual_invocations = [
|
||||
Invocation(
|
||||
user_content=genai_types.Content(
|
||||
@@ -77,8 +78,11 @@ class TestVertexAiEvalFacade:
|
||||
vertexai_types.PrebuiltMetric.COHERENCE.name
|
||||
]
|
||||
|
||||
def test_evaluate_invocations_metric_failed(self, mock_perform_eval):
|
||||
def test_evaluate_invocations_metric_failed(self, mocker):
|
||||
"""Test evaluate_invocations function for a metric."""
|
||||
mock_perform_eval = mocker.patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
actual_invocations = [
|
||||
Invocation(
|
||||
user_content=genai_types.Content(
|
||||
@@ -133,9 +137,12 @@ class TestVertexAiEvalFacade:
|
||||
],
|
||||
)
|
||||
def test_evaluate_invocations_metric_no_score(
|
||||
self, mock_perform_eval, summary_metric_with_no_score
|
||||
self, mocker, summary_metric_with_no_score
|
||||
):
|
||||
"""Test evaluate_invocations function for a metric."""
|
||||
mock_perform_eval = mocker.patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
actual_invocations = [
|
||||
Invocation(
|
||||
user_content=genai_types.Content(
|
||||
@@ -180,10 +187,11 @@ class TestVertexAiEvalFacade:
|
||||
vertexai_types.PrebuiltMetric.COHERENCE.name
|
||||
]
|
||||
|
||||
def test_evaluate_invocations_metric_multiple_invocations(
|
||||
self, mock_perform_eval
|
||||
):
|
||||
def test_evaluate_invocations_metric_multiple_invocations(self, mocker):
|
||||
"""Test evaluate_invocations function for a metric with multiple invocations."""
|
||||
mock_perform_eval = mocker.patch(
|
||||
"google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval"
|
||||
)
|
||||
num_invocations = 6
|
||||
actual_invocations = []
|
||||
expected_invocations = []
|
||||
|
||||
Reference in New Issue
Block a user