diff --git a/tests/unittests/evaluation/test_eval_config.py b/tests/unittests/evaluation/test_eval_config.py index 150e30a6..a46b9dfc 100644 --- a/tests/unittests/evaluation/test_eval_config.py +++ b/tests/unittests/evaluation/test_eval_config.py @@ -14,8 +14,6 @@ from __future__ import annotations -from unittest import mock - from google.adk.evaluation.eval_config import _DEFAULT_EVAL_CONFIG from google.adk.evaluation.eval_config import EvalConfig from google.adk.evaluation.eval_config import get_eval_metrics_from_config @@ -28,13 +26,14 @@ def test_get_evaluation_criteria_or_default_returns_default(): assert get_evaluation_criteria_or_default("") == _DEFAULT_EVAL_CONFIG -def test_get_evaluation_criteria_or_default_reads_from_file(): +def test_get_evaluation_criteria_or_default_reads_from_file(mocker): eval_config = EvalConfig( criteria={"tool_trajectory_avg_score": 0.5, "response_match_score": 0.5} ) - mock_open = mock.mock_open(read_data=eval_config.model_dump_json()) - with mock.patch("builtins.open", mock_open): - assert get_evaluation_criteria_or_default("dummy_path") == eval_config + mocker.patch( + "builtins.open", mocker.mock_open(read_data=eval_config.model_dump_json()) + ) + assert get_evaluation_criteria_or_default("dummy_path") == eval_config def test_get_eval_metrics_from_config(): diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py index 4d6d5509..133c6187 100644 --- a/tests/unittests/evaluation/test_evaluation_generator.py +++ b/tests/unittests/evaluation/test_evaluation_generator.py @@ -14,8 +14,6 @@ from __future__ import annotations -from unittest import mock - from google.adk.evaluation.app_details import AgentDetails from google.adk.evaluation.app_details import AppDetails from google.adk.evaluation.evaluation_generator import EvaluationGenerator @@ -206,17 +204,17 @@ class TestConvertEventsToEvalInvocation: class TestGetAppDetailsByInvocationId: """Test cases for EvaluationGenerator._get_app_details_by_invocation_id method.""" - def test_get_app_details_by_invocation_id_empty(self): + def test_get_app_details_by_invocation_id_empty(self, mocker): """Tests with an empty list of events.""" - mock_request_intercepter = mock.MagicMock(spec=_RequestIntercepterPlugin) + mock_request_intercepter = mocker.MagicMock(spec=_RequestIntercepterPlugin) app_details = EvaluationGenerator._get_app_details_by_invocation_id( [], mock_request_intercepter ) assert app_details == {} - def test_get_app_details_by_invocation_id_no_model_requests(self): + def test_get_app_details_by_invocation_id_no_model_requests(self, mocker): """Tests when request_intercepter returns no model requests.""" - mock_request_intercepter = mock.MagicMock(spec=_RequestIntercepterPlugin) + mock_request_intercepter = mocker.MagicMock(spec=_RequestIntercepterPlugin) mock_request_intercepter.get_model_request.return_value = None events = [ _build_event("user", [types.Part(text="Hello")], "inv1"), @@ -230,9 +228,9 @@ class TestGetAppDetailsByInvocationId: events[1] ) - def test_get_app_details_single_invocation_single_agent(self): + def test_get_app_details_single_invocation_single_agent(self, mocker): """Tests a single invocation with one agent.""" - mock_request_intercepter = mock.MagicMock(spec=_RequestIntercepterPlugin) + mock_request_intercepter = mocker.MagicMock(spec=_RequestIntercepterPlugin) mock_llm_request = LlmRequest(model="test") mock_llm_request.config.system_instruction = "instruction1" mock_llm_request.config.tools = 
[types.Tool()] @@ -262,9 +260,9 @@ class TestGetAppDetailsByInvocationId: events[1] ) - def test_get_app_details_multiple_invocations_multiple_agents(self): + def test_get_app_details_multiple_invocations_multiple_agents(self, mocker): """Tests multiple invocations with multiple agents.""" - mock_request_intercepter = mock.MagicMock(spec=_RequestIntercepterPlugin) + mock_request_intercepter = mocker.MagicMock(spec=_RequestIntercepterPlugin) def get_model_request_side_effect(event): mock_llm_request = LlmRequest(model="test") diff --git a/tests/unittests/evaluation/test_hallucinations_v1.py b/tests/unittests/evaluation/test_hallucinations_v1.py index d74cb24a..42953743 100644 --- a/tests/unittests/evaluation/test_hallucinations_v1.py +++ b/tests/unittests/evaluation/test_hallucinations_v1.py @@ -13,9 +13,6 @@ # limitations under the License. import json -from unittest.mock import AsyncMock -from unittest.mock import MagicMock -from unittest.mock import patch from google.adk.evaluation.app_details import AgentDetails from google.adk.evaluation.app_details import AppDetails @@ -34,13 +31,13 @@ import pytest @pytest.fixture -def mock_llm_registry(): +def mock_llm_registry(mocker): """Mocks LLMRegistry to avoid actual model loading during tests.""" - with patch( + MockLLMRegistry = mocker.patch( "google.adk.evaluation.hallucinations_v1.LLMRegistry" - ) as MockLLMRegistry: - MockLLMRegistry.return_value.resolve.return_value = AsyncMock - yield + ) + MockLLMRegistry.return_value.resolve.return_value = mocker.MagicMock() + yield @pytest.fixture @@ -176,8 +173,8 @@ contradicting_excerpt: Pears are blue fruits class TestEvaluateNlResponse: """Test cases for _evaluate_nl_response method.""" - def _create_genai_response(self, text): - response_mock = MagicMock() + def _create_genai_response(self, text, mocker): + response_mock = mocker.MagicMock() response_mock.content = genai_types.Content( parts=[genai_types.Part(text=text)] ) @@ -185,12 +182,12 @@ class TestEvaluateNlResponse: @pytest.mark.asyncio async def test_evaluate_nl_response_unexpected_labels( - self, hallucinations_metric + self, hallucinations_metric, mocker ): """Tests _evaluate_nl_response with unexpected labels.""" metric = hallucinations_metric seg_response = self._create_genai_response( - "sentence 1sentence 2" + "sentence 1sentence 2", mocker ) val_response_text = """sentence: sentence 1 label: @@ -204,7 +201,7 @@ rationale: r2 supporting_excerpt: null contradicting_excerpt: null """ - val_response = self._create_genai_response(val_response_text) + val_response = self._create_genai_response(val_response_text, mocker) async def seg_gen(): yield seg_response @@ -212,7 +209,7 @@ contradicting_excerpt: null async def val_gen(): yield val_response - metric._judge_model.generate_content_async = MagicMock( + metric._judge_model.generate_content_async = mocker.MagicMock( side_effect=[ seg_gen(), val_gen(), @@ -223,14 +220,14 @@ contradicting_excerpt: null @pytest.mark.asyncio async def test_evaluate_nl_response_missing_label( - self, hallucinations_metric + self, hallucinations_metric, mocker ): """Tests _evaluate_nl_response with missing labels in validation results.""" metric = hallucinations_metric seg_response = self._create_genai_response( - "sentence 1" + "sentence 1", mocker ) - val_response = self._create_genai_response("val_response") + val_response = self._create_genai_response("val_response", mocker) async def seg_gen(): yield seg_response @@ -238,7 +235,7 @@ contradicting_excerpt: null async def val_gen(): yield val_response - 
metric._judge_model.generate_content_async = MagicMock( + metric._judge_model.generate_content_async = mocker.MagicMock( side_effect=[ seg_gen(), val_gen(), @@ -585,7 +582,7 @@ class TestEvaluateInvocationsAgentTree: @pytest.mark.asyncio async def test_evaluate_invocations_multi_agents( - self, hallucinations_metric, agent_tree_data + self, hallucinations_metric, agent_tree_data, mocker ): """Tests evaluate_invocations with agent tree and checks contexts.""" invocation, expected_invocation = agent_tree_data @@ -804,22 +801,22 @@ Agent2 response. }]) return None, "error" - with patch( + mocker.patch( "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response", side_effect=mock_evaluate_nl_response, - ): - result = await metric.evaluate_invocations( - [invocation], [expected_invocation] - ) + ) + result = await metric.evaluate_invocations( + [invocation], [expected_invocation] + ) - assert result.overall_score == pytest.approx(0.5) - assert len(result.per_invocation_results) == 1 - per_invocation_result = result.per_invocation_results[0] - assert per_invocation_result.score == pytest.approx(0.5) + assert result.overall_score == pytest.approx(0.5) + assert len(result.per_invocation_results) == 1 + per_invocation_result = result.per_invocation_results[0] + assert per_invocation_result.score == pytest.approx(0.5) @pytest.mark.asyncio async def test_evaluate_invocations_agent_tree_skip_intermediate( - self, mock_llm_registry, agent_tree_data + self, mock_llm_registry, agent_tree_data, mocker ): """Tests evaluate_invocations with agent tree skipping intermediate steps.""" invocation, expected_invocation = agent_tree_data @@ -927,18 +924,18 @@ Agent2 response. "label": "contradictory", }]) - with patch( + mocker.patch( "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response", side_effect=mock_evaluate_nl_response, - ): - result = await metric.evaluate_invocations( - [invocation], [expected_invocation] - ) + ) + result = await metric.evaluate_invocations( + [invocation], [expected_invocation] + ) - assert result.overall_score == 0.0 - assert len(result.per_invocation_results) == 1 - per_invocation_result = result.per_invocation_results[0] - assert per_invocation_result.score == 0.0 + assert result.overall_score == 0.0 + assert len(result.per_invocation_results) == 1 + per_invocation_result = result.per_invocation_results[0] + assert per_invocation_result.score == 0.0 @pytest.fixture @@ -1053,7 +1050,7 @@ class TestEvaluateInvocationsTimeWeather: @pytest.mark.asyncio async def test_evaluate_invocations_time_weather( - self, hallucinations_metric, time_weather_data + self, hallucinations_metric, time_weather_data, mocker ): """Tests evaluate_invocations with time/weather agent.""" invocation, response1, response2 = time_weather_data @@ -1190,20 +1187,20 @@ tool_outputs: ]) return None, "error" - with patch( + mocker.patch( "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response", side_effect=mock_evaluate_nl_response, - ): - result = await metric.evaluate_invocations([invocation], [invocation]) + ) + result = await metric.evaluate_invocations([invocation], [invocation]) - assert result.overall_score == pytest.approx(5 / 6) - assert len(result.per_invocation_results) == 1 - per_invocation_result = result.per_invocation_results[0] - assert per_invocation_result.score == pytest.approx(5 / 6) + assert result.overall_score == pytest.approx(5 / 6) + assert len(result.per_invocation_results) == 1 + 
per_invocation_result = result.per_invocation_results[0] + assert per_invocation_result.score == pytest.approx(5 / 6) @pytest.mark.asyncio async def test_evaluate_invocations_time_weather_skip_intermediate( - self, mock_llm_registry, time_weather_data + self, mock_llm_registry, time_weather_data, mocker ): """Tests evaluate_invocations with time/weather agent.""" invocation, _, response2 = time_weather_data @@ -1300,20 +1297,20 @@ tool_outputs: {"sentence": sentence2, "label": "supported"}, ]) - with patch( + mocker.patch( "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response", side_effect=mock_evaluate_nl_response, - ): - result = await metric.evaluate_invocations([invocation], [invocation]) + ) + result = await metric.evaluate_invocations([invocation], [invocation]) - assert result.overall_score == 1.0 - assert len(result.per_invocation_results) == 1 - per_invocation_result = result.per_invocation_results[0] - assert per_invocation_result.score == 1.0 + assert result.overall_score == 1.0 + assert len(result.per_invocation_results) == 1 + per_invocation_result = result.per_invocation_results[0] + assert per_invocation_result.score == 1.0 @pytest.mark.asyncio -async def test_evaluate_invocations_success_path(hallucinations_metric): +async def test_evaluate_invocations_success_path(hallucinations_metric, mocker): metric = hallucinations_metric app_details = AppDetails( agent_details={ @@ -1380,18 +1377,18 @@ async def test_evaluate_invocations_success_path(hallucinations_metric): ) return None, "error" - with patch( + mocker.patch( "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response", side_effect=mock_evaluate_nl_response, - ): - result = await metric.evaluate_invocations( - [actual_invocation], [expected_invocation] - ) + ) + result = await metric.evaluate_invocations( + [actual_invocation], [expected_invocation] + ) - assert result.overall_score == pytest.approx(0.5) - assert len(result.per_invocation_results) == 1 - per_invocation_result = result.per_invocation_results[0] - assert per_invocation_result.score == pytest.approx(0.5) + assert result.overall_score == pytest.approx(0.5) + assert len(result.per_invocation_results) == 1 + per_invocation_result = result.per_invocation_results[0] + assert per_invocation_result.score == pytest.approx(0.5) @pytest.mark.asyncio @@ -1446,7 +1443,9 @@ async def test_evaluate_invocations_no_nl_response(hallucinations_metric): @pytest.mark.asyncio -async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric): +async def test_evaluate_all_invocations_not_evaluated( + hallucinations_metric, mocker +): metric = hallucinations_metric app_details = AppDetails( agent_details={ @@ -1490,30 +1489,32 @@ async def test_evaluate_all_invocations_not_evaluated(hallucinations_metric): async def mock_evaluate_nl_response(nl_response, context): return None, "Judge model error." 
- with patch( + mocker.patch( "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response", side_effect=mock_evaluate_nl_response, - ): - result = await metric.evaluate_invocations( - [actual_invocation, actual_invocation], - [expected_invocation, expected_invocation], - ) + ) + result = await metric.evaluate_invocations( + [actual_invocation, actual_invocation], + [expected_invocation, expected_invocation], + ) - assert len(result.per_invocation_results) == 2 - assert result.per_invocation_results[0].score is None - assert ( - result.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED - ) - assert result.per_invocation_results[1].score is None - assert ( - result.per_invocation_results[1].eval_status == EvalStatus.NOT_EVALUATED - ) - assert result.overall_score is None - assert result.overall_eval_status == EvalStatus.NOT_EVALUATED + assert len(result.per_invocation_results) == 2 + assert result.per_invocation_results[0].score is None + assert ( + result.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED + ) + assert result.per_invocation_results[1].score is None + assert ( + result.per_invocation_results[1].eval_status == EvalStatus.NOT_EVALUATED + ) + assert result.overall_score is None + assert result.overall_eval_status == EvalStatus.NOT_EVALUATED @pytest.mark.asyncio -async def test_evaluate_invocations_partial_failure(hallucinations_metric): +async def test_evaluate_invocations_partial_failure( + hallucinations_metric, mocker +): metric = hallucinations_metric app_details = AppDetails( agent_details={ @@ -1563,15 +1564,15 @@ async def test_evaluate_invocations_partial_failure(hallucinations_metric): return None, "some error during evaluation" return None, "error" - with patch( + mocker.patch( "google.adk.evaluation.hallucinations_v1.HallucinationsV1Evaluator._evaluate_nl_response", side_effect=mock_evaluate_nl_response, - ): - result = await metric.evaluate_invocations( - [actual_invocation], [expected_invocation] - ) + ) + result = await metric.evaluate_invocations( + [actual_invocation], [expected_invocation] + ) - assert result.overall_score == 0.8 - assert len(result.per_invocation_results) == 1 - per_invocation_result = result.per_invocation_results[0] - assert per_invocation_result.score == 0.8 + assert result.overall_score == 0.8 + assert len(result.per_invocation_results) == 1 + per_invocation_result = result.per_invocation_results[0] + assert per_invocation_result.score == 0.8 diff --git a/tests/unittests/evaluation/test_llm_as_judge.py b/tests/unittests/evaluation/test_llm_as_judge.py index 6618e6c1..eb5a1154 100644 --- a/tests/unittests/evaluation/test_llm_as_judge.py +++ b/tests/unittests/evaluation/test_llm_as_judge.py @@ -15,7 +15,6 @@ from __future__ import annotations from typing import Optional -from unittest.mock import MagicMock from google.adk.evaluation.eval_case import Invocation from google.adk.evaluation.eval_metrics import EvalMetric @@ -128,8 +127,8 @@ def test_llm_as_judge_init_unregistered_model(): @pytest.fixture -def mock_judge_model(): - mock_judge_model = MagicMock() +def mock_judge_model(mocker): + mock_judge_model = mocker.MagicMock() async def mock_generate_content_async(llm_request): yield LlmResponse( @@ -144,30 +143,30 @@ def mock_judge_model(): @pytest.mark.asyncio async def test_evaluate_invocations_with_mock( - mock_llm_as_judge, mock_judge_model + mock_llm_as_judge, mock_judge_model, mocker ): mock_llm_as_judge._judge_model = mock_judge_model - mock_format_auto_rater_prompt = 
MagicMock( + mock_format_auto_rater_prompt = mocker.MagicMock( wraps=mock_llm_as_judge.format_auto_rater_prompt ) mock_llm_as_judge.format_auto_rater_prompt = mock_format_auto_rater_prompt - mock_convert_auto_rater_response_to_score = MagicMock( + mock_convert_auto_rater_response_to_score = mocker.MagicMock( wraps=mock_llm_as_judge.convert_auto_rater_response_to_score ) mock_llm_as_judge.convert_auto_rater_response_to_score = ( mock_convert_auto_rater_response_to_score ) - mock_aggregate_per_invocation_samples = MagicMock( + mock_aggregate_per_invocation_samples = mocker.MagicMock( wraps=mock_llm_as_judge.aggregate_per_invocation_samples ) mock_llm_as_judge.aggregate_per_invocation_samples = ( mock_aggregate_per_invocation_samples ) - mock_aggregate_invocation_results = MagicMock( + mock_aggregate_invocation_results = mocker.MagicMock( wraps=mock_llm_as_judge.aggregate_invocation_results ) mock_llm_as_judge.aggregate_invocation_results = ( diff --git a/tests/unittests/evaluation/test_local_eval_service.py b/tests/unittests/evaluation/test_local_eval_service.py index f68136ca..c9010444 100644 --- a/tests/unittests/evaluation/test_local_eval_service.py +++ b/tests/unittests/evaluation/test_local_eval_service.py @@ -14,7 +14,6 @@ import asyncio import sys -from unittest import mock from google.adk.agents.llm_agent import LlmAgent from google.adk.errors.not_found_error import NotFoundError @@ -47,8 +46,8 @@ import pytest @pytest.fixture -def mock_eval_sets_manager(): - return mock.create_autospec(EvalSetsManager) +def mock_eval_sets_manager(mocker): + return mocker.create_autospec(EvalSetsManager) @pytest.fixture @@ -58,8 +57,8 @@ def dummy_agent(): @pytest.fixture -def mock_eval_set_results_manager(): - return mock.create_autospec(EvalSetResultsManager) +def mock_eval_set_results_manager(mocker): + return mocker.create_autospec(EvalSetResultsManager) @pytest.fixture @@ -118,6 +117,7 @@ async def test_perform_inference_success( eval_service, dummy_agent, mock_eval_sets_manager, + mocker, ): eval_set = EvalSet( eval_set_id="test_eval_set", @@ -128,8 +128,8 @@ async def test_perform_inference_success( ) mock_eval_sets_manager.get_eval_set.return_value = eval_set - mock_inference_result = mock.MagicMock() - eval_service._perform_inference_sigle_eval_item = mock.AsyncMock( + mock_inference_result = mocker.MagicMock() + eval_service._perform_inference_sigle_eval_item = mocker.AsyncMock( return_value=mock_inference_result ) @@ -157,6 +157,7 @@ async def test_perform_inference_with_case_ids( eval_service, dummy_agent, mock_eval_sets_manager, + mocker, ): eval_set = EvalSet( eval_set_id="test_eval_set", @@ -168,8 +169,8 @@ async def test_perform_inference_with_case_ids( ) mock_eval_sets_manager.get_eval_set.return_value = eval_set - mock_inference_result = mock.MagicMock() - eval_service._perform_inference_sigle_eval_item = mock.AsyncMock( + mock_inference_result = mocker.MagicMock() + eval_service._perform_inference_sigle_eval_item = mocker.AsyncMock( return_value=mock_inference_result ) @@ -219,7 +220,7 @@ async def test_perform_inference_eval_set_not_found( @pytest.mark.asyncio async def test_evaluate_success( - eval_service, mock_eval_sets_manager, mock_eval_set_results_manager + eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker ): inference_results = [ InferenceResult( @@ -243,7 +244,7 @@ async def test_evaluate_success( evaluate_config=EvaluateConfig(eval_metrics=[eval_metric], parallelism=2), ) - mock_eval_case = mock.MagicMock(spec=EvalCase) + mock_eval_case = 
mocker.MagicMock(spec=EvalCase) mock_eval_case.conversation = [] mock_eval_case.session_input = None mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case @@ -290,7 +291,7 @@ async def test_evaluate_eval_case_not_found( @pytest.mark.asyncio async def test_evaluate_single_inference_result( - eval_service, mock_eval_sets_manager, mock_eval_set_results_manager + eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker ): invocation = Invocation( user_content=genai_types.Content( @@ -314,7 +315,7 @@ async def test_evaluate_single_inference_result( eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5) evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1) - mock_eval_case = mock.MagicMock(spec=EvalCase) + mock_eval_case = mocker.MagicMock(spec=EvalCase) mock_eval_case.conversation = [ invocation.model_copy(deep=True), invocation.model_copy(deep=True), @@ -370,7 +371,7 @@ def test_generate_final_eval_status_doesn_t_throw_on(eval_service): @pytest.mark.skipif( sys.version_info < (3, 10), reason="MCP tool requires Python 3.10+" ) -async def test_mcp_stdio_agent_no_runtime_error(): +async def test_mcp_stdio_agent_no_runtime_error(mocker): """Test that LocalEvalService can handle MCP stdio agents without RuntimeError. This is a regression test for GitHub issue #2196: @@ -421,7 +422,7 @@ async def test_mcp_stdio_agent_no_runtime_error(): ) # Create a mock eval sets manager that returns an eval case - mock_eval_sets_manager = mock.create_autospec(EvalSetsManager) + mock_eval_sets_manager = mocker.create_autospec(EvalSetsManager) test_eval_case = EvalCase( eval_id="test_mcp_case", conversation=[ diff --git a/tests/unittests/evaluation/test_local_eval_set_results_manager.py b/tests/unittests/evaluation/test_local_eval_set_results_manager.py index 3411d9b7..45500d71 100644 --- a/tests/unittests/evaluation/test_local_eval_set_results_manager.py +++ b/tests/unittests/evaluation/test_local_eval_set_results_manager.py @@ -19,7 +19,6 @@ import os import shutil import tempfile import time -from unittest.mock import patch from google.adk.errors.not_found_error import NotFoundError from google.adk.evaluation._eval_set_results_manager_utils import _sanitize_eval_set_result_name @@ -68,12 +67,11 @@ class TestLocalEvalSetResultsManager: eval_case_results=self.eval_case_results, creation_timestamp=self.timestamp, ) - - def teardown(self): + yield shutil.rmtree(self.temp_dir) - @patch("time.time") - def test_save_eval_set_result(self, mock_time): + def test_save_eval_set_result(self, mocker): + mock_time = mocker.patch("time.time") mock_time.return_value = self.timestamp self.manager.save_eval_set_result( self.app_name, self.eval_set_id, self.eval_case_results @@ -93,8 +91,8 @@ class TestLocalEvalSetResultsManager: expected_eval_set_result_json = self.eval_set_result.model_dump_json() assert expected_eval_set_result_json == actual_eval_set_result_json - @patch("time.time") - def test_get_eval_set_result(self, mock_time): + def test_get_eval_set_result(self, mocker): + mock_time = mocker.patch("time.time") mock_time.return_value = self.timestamp self.manager.save_eval_set_result( self.app_name, self.eval_set_id, self.eval_case_results @@ -104,15 +102,15 @@ class TestLocalEvalSetResultsManager: ) assert retrieved_result == self.eval_set_result - @patch("time.time") - def test_get_eval_set_result_not_found(self, mock_time): + def test_get_eval_set_result_not_found(self, mocker): + mock_time = mocker.patch("time.time") mock_time.return_value = 
self.timestamp with pytest.raises(NotFoundError) as e: self.manager.get_eval_set_result(self.app_name, "non_existent_id") - @patch("time.time") - def test_list_eval_set_results(self, mock_time): + def test_list_eval_set_results(self, mocker): + mock_time = mocker.patch("time.time") mock_time.return_value = self.timestamp # Save two eval set results for the same app self.manager.save_eval_set_result( diff --git a/tests/unittests/evaluation/test_request_intercepter_plugin.py b/tests/unittests/evaluation/test_request_intercepter_plugin.py index 1381e001..3fa0aa50 100644 --- a/tests/unittests/evaluation/test_request_intercepter_plugin.py +++ b/tests/unittests/evaluation/test_request_intercepter_plugin.py @@ -14,8 +14,6 @@ from __future__ import annotations -from unittest import mock - from google.adk.agents.callback_context import CallbackContext from google.adk.evaluation.request_intercepter_plugin import _LLM_REQUEST_ID_KEY from google.adk.evaluation.request_intercepter_plugin import _RequestIntercepterPlugin @@ -26,7 +24,7 @@ from google.genai import types class TestRequestIntercepterPlugin: - async def test_intercept_request_and_response(self): + async def test_intercept_request_and_response(self, mocker): plugin = _RequestIntercepterPlugin(name="test_plugin") llm_request = LlmRequest( model="test_model", @@ -37,7 +35,7 @@ class TestRequestIntercepterPlugin: ) ], ) - mock_invocation_context = mock.MagicMock() + mock_invocation_context = mocker.MagicMock() mock_invocation_context.session.state = {} callback_context = CallbackContext(mock_invocation_context) llm_response = LlmResponse() diff --git a/tests/unittests/evaluation/test_response_evaluator.py b/tests/unittests/evaluation/test_response_evaluator.py index bace9c6a..8bf4b76f 100644 --- a/tests/unittests/evaluation/test_response_evaluator.py +++ b/tests/unittests/evaluation/test_response_evaluator.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations + """Tests for the Response Evaluator.""" -from unittest.mock import patch from google.adk.evaluation.eval_case import Invocation from google.adk.evaluation.eval_metrics import PrebuiltMetrics @@ -24,14 +25,14 @@ import pytest from vertexai import types as vertexai_types -@patch( - "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" -) class TestResponseEvaluator: """A class to help organize "patch" that are applicable to all tests.""" - def test_evaluate_invocations_rouge_metric(self, mock_perform_eval): + def test_evaluate_invocations_rouge_metric(self, mocker): """Test evaluate_invocations function for Rouge metric.""" + mock_perform_eval = mocker.patch( + "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" + ) actual_invocations = [ Invocation( user_content=genai_types.Content( @@ -67,10 +68,11 @@ class TestResponseEvaluator: assert evaluation_result.overall_eval_status == EvalStatus.FAILED mock_perform_eval.assert_not_called() # Ensure _perform_eval was not called - def test_evaluate_invocations_coherence_metric_passed( - self, mock_perform_eval - ): + def test_evaluate_invocations_coherence_metric_passed(self, mocker): """Test evaluate_invocations function for Coherence metric.""" + mock_perform_eval = mocker.patch( + "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" + ) actual_invocations = [ Invocation( user_content=genai_types.Content( @@ -115,7 +117,7 @@ class TestResponseEvaluator: vertexai_types.PrebuiltMetric.COHERENCE.name ] - def test_get_metric_info_response_evaluation_score(self, mock_perform_eval): + def test_get_metric_info_response_evaluation_score(self): """Test get_metric_info function for response evaluation metric.""" metric_info = ResponseEvaluator.get_metric_info( PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value @@ -127,7 +129,7 @@ class TestResponseEvaluator: assert metric_info.metric_value_info.interval.min_value == 1.0 assert metric_info.metric_value_info.interval.max_value == 5.0 - def test_get_metric_info_response_match_score(self, mock_perform_eval): + def test_get_metric_info_response_match_score(self): """Test get_metric_info function for response match metric.""" metric_info = ResponseEvaluator.get_metric_info( PrebuiltMetrics.RESPONSE_MATCH_SCORE.value @@ -136,7 +138,7 @@ class TestResponseEvaluator: assert metric_info.metric_value_info.interval.min_value == 0.0 assert metric_info.metric_value_info.interval.max_value == 1.0 - def test_get_metric_info_invalid(self, mock_perform_eval): + def test_get_metric_info_invalid(self): """Test get_metric_info function for invalid metric.""" with pytest.raises(ValueError): ResponseEvaluator.get_metric_info("invalid_metric") diff --git a/tests/unittests/evaluation/test_safety_evaluator.py b/tests/unittests/evaluation/test_safety_evaluator.py index 5cc95b1d..29acf969 100644 --- a/tests/unittests/evaluation/test_safety_evaluator.py +++ b/tests/unittests/evaluation/test_safety_evaluator.py @@ -13,7 +13,6 @@ # limitations under the License. 
"""Tests for the Response Evaluator.""" -from unittest.mock import patch from google.adk.evaluation.eval_case import Invocation from google.adk.evaluation.eval_metrics import EvalMetric @@ -24,16 +23,14 @@ from google.genai import types as genai_types from vertexai import types as vertexai_types -@patch( - "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" -) class TestSafetyEvaluatorV1: """A class to help organize "patch" that are applicable to all tests.""" - def test_evaluate_invocations_coherence_metric_passed( - self, mock_perform_eval - ): + def test_evaluate_invocations_coherence_metric_passed(self, mocker): """Test evaluate_invocations function for Coherence metric.""" + mock_perform_eval = mocker.patch( + "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" + ) actual_invocations = [ Invocation( user_content=genai_types.Content( @@ -78,7 +75,7 @@ class TestSafetyEvaluatorV1: vertexai_types.PrebuiltMetric.SAFETY.name ] - def test_get_metric_info(self, mock_perform_eval): + def test_get_metric_info(self): """Test get_metric_info function for Safety metric.""" metric_info = SafetyEvaluatorV1.get_metric_info() assert metric_info.metric_name == PrebuiltMetrics.SAFETY_V1.value diff --git a/tests/unittests/evaluation/test_vertex_ai_eval_facade.py b/tests/unittests/evaluation/test_vertex_ai_eval_facade.py index 8fd1705c..ee4ff1d7 100644 --- a/tests/unittests/evaluation/test_vertex_ai_eval_facade.py +++ b/tests/unittests/evaluation/test_vertex_ai_eval_facade.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + """Tests for the Response Evaluator.""" import math import random -from unittest.mock import patch from google.adk.evaluation.eval_case import Invocation from google.adk.evaluation.evaluator import EvalStatus @@ -25,14 +26,14 @@ import pytest from vertexai import types as vertexai_types -@patch( - "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" -) class TestVertexAiEvalFacade: """A class to help organize "patch" that are applicable to all tests.""" - def test_evaluate_invocations_metric_passed(self, mock_perform_eval): + def test_evaluate_invocations_metric_passed(self, mocker): """Test evaluate_invocations function for a metric.""" + mock_perform_eval = mocker.patch( + "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" + ) actual_invocations = [ Invocation( user_content=genai_types.Content( @@ -77,8 +78,11 @@ class TestVertexAiEvalFacade: vertexai_types.PrebuiltMetric.COHERENCE.name ] - def test_evaluate_invocations_metric_failed(self, mock_perform_eval): + def test_evaluate_invocations_metric_failed(self, mocker): """Test evaluate_invocations function for a metric.""" + mock_perform_eval = mocker.patch( + "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" + ) actual_invocations = [ Invocation( user_content=genai_types.Content( @@ -133,9 +137,12 @@ class TestVertexAiEvalFacade: ], ) def test_evaluate_invocations_metric_no_score( - self, mock_perform_eval, summary_metric_with_no_score + self, mocker, summary_metric_with_no_score ): """Test evaluate_invocations function for a metric.""" + mock_perform_eval = mocker.patch( + "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" + ) actual_invocations = [ Invocation( user_content=genai_types.Content( @@ -180,10 +187,11 @@ class TestVertexAiEvalFacade: 
vertexai_types.PrebuiltMetric.COHERENCE.name ] - def test_evaluate_invocations_metric_multiple_invocations( - self, mock_perform_eval - ): + def test_evaluate_invocations_metric_multiple_invocations(self, mocker): """Test evaluate_invocations function for a metric with multiple invocations.""" + mock_perform_eval = mocker.patch( + "google.adk.evaluation.vertex_ai_eval_facade._VertexAiEvalFacade._perform_eval" + ) num_invocations = 6 actual_invocations = [] expected_invocations = []
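
Taken together, the changes above are a mechanical migration from unittest.mock (module-level imports, `with patch(...)` context managers, and class-level `@patch` decorators) to pytest-mock's `mocker` fixture, which records every patch and undoes it automatically at test teardown. The sketch below is a minimal, self-contained illustration of the patterns the diff adopts; `load_config` and `Client` are hypothetical stand-ins for illustration only, not ADK APIs, and the async test assumes pytest-asyncio is available, as it already is for the tests above.

"""Minimal sketch of the pytest-mock patterns adopted in this diff.

Assumes pytest and pytest-mock; `load_config` and `Client` are hypothetical
stand-ins, not ADK APIs.
"""
import json

import pytest


def load_config(path):
  """Hypothetical function under test: reads a JSON config from disk."""
  with open(path) as f:
    return json.load(f)


class Client:
  """Hypothetical collaborator that tests replace with a spec'd mock."""

  def fetch(self, key):
    raise NotImplementedError


def test_load_config_reads_from_file(mocker):
  # mocker.patch swaps the target for this test only; pytest-mock undoes the
  # patch at teardown, so no `with` block or `@patch` decorator is needed.
  mocker.patch(
      "builtins.open",
      mocker.mock_open(read_data=json.dumps({"threshold": 0.5})),
  )
  assert load_config("dummy_path") == {"threshold": 0.5}


def test_spec_and_autospec(mocker):
  # mocker.MagicMock(spec=...) and mocker.create_autospec(...) mirror the
  # unittest.mock helpers, but come from the fixture so imports and cleanup
  # stay in one place.
  client = mocker.MagicMock(spec=Client)
  client.fetch.return_value = "value"
  assert client.fetch("key") == "value"

  strict_client = mocker.create_autospec(Client)
  strict_client.fetch.return_value = "other"
  assert strict_client.fetch("key") == "other"


@pytest.mark.asyncio
async def test_async_dependency(mocker):
  # mocker.AsyncMock replaces unittest.mock.AsyncMock for awaitable stubs.
  fetch = mocker.AsyncMock(return_value="async-value")
  assert await fetch("key") == "async-value"

Because `mocker` scopes each patch to the requesting test, the `with patch(...):` blocks could be flattened into straight-line test bodies, the class-level `@patch` decorators on TestResponseEvaluator, TestSafetyEvaluatorV1, and TestVertexAiEvalFacade could move into the individual tests that exercise `_perform_eval`, and tests that never used the injected mock (for example the `get_metric_info` tests) could drop the `mock_perform_eval` parameter entirely.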