chore: Marked expected_invocation as optional field on evaluator interface

ADK already has a set of metrics that don't rely on expected_invocations. Also, for eval cases with a conversation scenario, this would be the mainline case.

PiperOrigin-RevId: 825101481
This commit is contained in:
Ankur Sharma
2025-10-28 10:27:06 -07:00
committed by Copybara-Service
parent 9ab17f2afd
commit b17c8f19e5
15 changed files with 282 additions and 102 deletions
+9 -7
View File
@@ -210,21 +210,23 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
data = []
for per_invocation_result in eval_result.eval_metric_result_per_invocation:
actual_invocation = per_invocation_result.actual_invocation
expected_invocation = per_invocation_result.expected_invocation
row_data = {
"prompt": _convert_content_to_text(
per_invocation_result.expected_invocation.user_content
),
"prompt": _convert_content_to_text(actual_invocation.user_content),
"expected_response": _convert_content_to_text(
per_invocation_result.expected_invocation.final_response
expected_invocation.final_response if expected_invocation else None
),
"actual_response": _convert_content_to_text(
per_invocation_result.actual_invocation.final_response
actual_invocation.final_response
),
"expected_tool_calls": _convert_tool_calls_to_text(
per_invocation_result.expected_invocation.intermediate_data
expected_invocation.intermediate_data
if expected_invocation
else None
),
"actual_tool_calls": _convert_tool_calls_to_text(
per_invocation_result.actual_invocation.intermediate_data
actual_invocation.intermediate_data
),
}
for metric_result in per_invocation_result.eval_metric_results:
+3 -2
View File
@@ -216,10 +216,11 @@ class EvalMetricResultPerInvocation(EvalBaseModel):
)
)
expected_invocation: Invocation = Field(
expected_invocation: Optional[Invocation] = Field(
default=None,
description=(
"The expected invocation, usually the reference or golden invocation."
)
),
)
eval_metric_results: list[EvalMetricResult] = Field(
+12 -3
View File
@@ -33,7 +33,7 @@ class PerInvocationResult(BaseModel):
"""Metric evaluation score per invocation."""
actual_invocation: Invocation
expected_invocation: Invocation
expected_invocation: Optional[Invocation] = None
score: Optional[float] = None
eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
rubric_scores: Optional[list[RubricScore]] = None
@@ -61,7 +61,16 @@ class Evaluator(ABC):
def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
) -> EvaluationResult:
"""Returns EvaluationResult after performing evaluations using actual and expected invocations."""
"""Returns EvaluationResult after performing evaluations using actual and expected invocations.
Args:
actual_invocations: These are the invocations that are obtained from the
agent under test.
expected_invocations: An optional list of invocations that, if specified,
usually acts as a benchmark/golden response. If specified, the list is
usually expected to have the same length as actual_invocations.
"""
raise NotImplementedError()
@@ -59,8 +59,11 @@ class RougeEvaluator(Evaluator):
def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
) -> EvaluationResult:
if expected_invocations is None:
raise ValueError("expected_invocations is required for this metric.")
total_score = 0.0
num_invocations = 0
per_invocation_results = []
@@ -147,7 +147,11 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
self,
eval_metric: EvalMetric,
):
super().__init__(eval_metric, FinalResponseMatchV2Evaluator.criterion_type)
super().__init__(
eval_metric,
FinalResponseMatchV2Evaluator.criterion_type,
expected_invocations_required=True,
)
self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT
@staticmethod
@@ -166,8 +170,13 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
@override
def format_auto_rater_prompt(
self, actual_invocation: Invocation, expected_invocation: Invocation
self,
actual_invocation: Invocation,
expected_invocation: Optional[Invocation],
) -> str:
if expected_invocation is None:
raise ValueError("expected_invocation is required for this metric.")
reference = get_text_from_content(expected_invocation.final_response)
response = get_text_from_content(actual_invocation.final_response)
user_prompt = get_text_from_content(expected_invocation.user_content)
+12 -3
View File
@@ -395,7 +395,8 @@ class HallucinationsV1Evaluator(Evaluator):
},
{
"name": "get_weather",
"description": '''Gets the weather of the given place at the given time.
"description": '''Gets the weather of the given place at the given
time.
Args:
location: The location for which to retrieve weather information.
@@ -408,7 +409,8 @@ class HallucinationsV1Evaluator(Evaluator):
"type": "object",
"properties": {
"location": {
"description": "The location for which to retrieve weather information.",
"description": "The location for which to retrieve weather
information.",
"type": "string"
},
"time": {
@@ -711,8 +713,15 @@ class HallucinationsV1Evaluator(Evaluator):
async def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
) -> EvaluationResult:
# expected_invocations are not required by the metric; if they are not
# supplied, we provide a list of None to the rest of the code.
expected_invocations = (
[None] * len(actual_invocations)
if expected_invocations is None
else expected_invocations
)
per_invocation_results = []
for actual, expected in zip(actual_invocations, expected_invocations):
step_evaluations = self._get_steps_to_evaluate(actual)
+18 -3
View File
@@ -60,9 +60,13 @@ class LlmAsJudge(Evaluator):
"""
def __init__(
self, eval_metric: EvalMetric, criterion_type: type[BaseCriterion]
self,
eval_metric: EvalMetric,
criterion_type: type[BaseCriterion],
expected_invocations_required=False,
):
self._eval_metric = eval_metric
self._expected_invocations_required = expected_invocations_required
expected_criterion_type_error = ValueError(
f"`{eval_metric.metric_name}` metric expects a criterion of type"
@@ -84,7 +88,7 @@ class LlmAsJudge(Evaluator):
@abstractmethod
def format_auto_rater_prompt(
self, actual: Invocation, expected: Invocation
self, actual: Invocation, expected: Optional[Invocation]
) -> str:
"""Formats the auto-rater prompt to evaluate the given invocation."""
@@ -112,8 +116,19 @@ class LlmAsJudge(Evaluator):
async def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
) -> EvaluationResult:
if self._expected_invocations_required and expected_invocations is None:
raise ValueError("expected_invocations is needed by this metric.")
# If expected_invocations are not required by the metric and they are not
# supplied, we provide a list of None.
expected_invocations = (
[None] * len(actual_invocations)
if expected_invocations is None
else expected_invocations
)
per_invocation_results = []
for actual, expected in zip(actual_invocations, expected_invocations):
auto_rater_prompt = self.format_auto_rater_prompt(actual, expected)
+42 -55
View File
@@ -22,8 +22,6 @@ from typing import Callable
from typing import Optional
import uuid
from google.genai.types import Content
from google.genai.types import Part
from typing_extensions import override
from ..agents.base_agent import BaseAgent
@@ -51,6 +49,7 @@ from .eval_sets_manager import EvalSetsManager
from .evaluation_generator import EvaluationGenerator
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import PerInvocationResult
from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
from .metric_evaluator_registry import MetricEvaluatorRegistry
from .user_simulator_provider import UserSimulatorProvider
@@ -222,43 +221,9 @@ class LocalEvalService(BaseEvalService):
else 'test_user_id'
)
if eval_case.conversation_scenario:
logger.warning(
'Skipping evaluation of variable-length conversation scenario in eval'
' set/case %s/%s.',
inference_result.eval_set_id,
inference_result.eval_case_id,
)
for actual_invocation in inference_result.inferences:
eval_metric_result_per_invocation.append(
EvalMetricResultPerInvocation(
actual_invocation=actual_invocation,
expected_invocation=Invocation(
user_content=actual_invocation.user_content,
final_response=Content(
parts=[Part(text='N/A')], role='model'
),
),
)
)
eval_case_result = EvalCaseResult(
eval_set_file=inference_result.eval_set_id,
eval_set_id=inference_result.eval_set_id,
eval_id=inference_result.eval_case_id,
final_eval_status=EvalStatus.NOT_EVALUATED,
overall_eval_metric_results=overall_eval_metric_results,
eval_metric_result_per_invocation=eval_metric_result_per_invocation,
session_id=inference_result.session_id,
session_details=await self._session_service.get_session(
app_name=inference_result.app_name,
user_id=user_id,
session_id=inference_result.session_id,
),
user_id=user_id,
)
return (inference_result, eval_case_result)
if len(inference_result.inferences) != len(eval_case.conversation):
if eval_case.conversation_scenario is None and len(
inference_result.inferences
) != len(eval_case.conversation):
raise ValueError(
'Inferences should match conversations in eval case. Found'
f'{len(inference_result.inferences)} inferences '
@@ -266,13 +231,13 @@ class LocalEvalService(BaseEvalService):
)
# Pre-creating the EvalMetricResults entries for each invocation.
for actual, expected in zip(
inference_result.inferences, eval_case.conversation
):
for idx, actual in enumerate(inference_result.inferences):
eval_metric_result_per_invocation.append(
EvalMetricResultPerInvocation(
actual_invocation=actual,
expected_invocation=expected,
expected_invocation=eval_case.conversation[idx]
if eval_case.conversation
else None,
# We will fill this as we evaluate each metric per invocation.
eval_metric_results=[],
)
@@ -280,11 +245,27 @@ class LocalEvalService(BaseEvalService):
for eval_metric in evaluate_config.eval_metrics:
# Perform evaluation of the metric.
evaluation_result = await self._evaluate_metric(
eval_metric=eval_metric,
actual_invocations=inference_result.inferences,
expected_invocations=eval_case.conversation,
)
try:
evaluation_result = await self._evaluate_metric(
eval_metric=eval_metric,
actual_invocations=inference_result.inferences,
expected_invocations=eval_case.conversation,
)
except Exception as e:
# We intentionally catch the Exception as we don't want failures to
# affect other metric evaluation.
logger.error(
"Metric evaluation failed for metric `%s` for eval case id '%s'"
' with following error `%s`',
eval_metric.metric_name,
eval_case.eval_id,
e,
exc_info=True,
)
# We use an empty result.
evaluation_result = EvaluationResult(
overall_eval_status=EvalStatus.NOT_EVALUATED
)
# Track overall score across all invocations.
eval_metric_result_details = EvalMetricResultDetails(
@@ -299,8 +280,10 @@ class LocalEvalService(BaseEvalService):
)
)
if len(evaluation_result.per_invocation_results) != len(
eval_metric_result_per_invocation
if (
evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
and len(evaluation_result.per_invocation_results)
!= len(eval_metric_result_per_invocation)
):
raise ValueError(
'Eval metric should return results for each invocation. Found '
@@ -309,10 +292,14 @@ class LocalEvalService(BaseEvalService):
)
# Track score across individual invocations.
for invocation_result, invocation in zip(
evaluation_result.per_invocation_results,
eval_metric_result_per_invocation,
):
for idx, invocation in enumerate(eval_metric_result_per_invocation):
invocation_result = (
evaluation_result.per_invocation_results[idx]
if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
else PerInvocationResult(
actual_invocation=invocation.actual_invocation
)
)
eval_metric_result_details = EvalMetricResultDetails(
rubric_scores=invocation_result.rubric_scores
)
@@ -351,7 +338,7 @@ class LocalEvalService(BaseEvalService):
self,
eval_metric: EvalMetric,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
) -> EvaluationResult:
"""Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""
@@ -100,7 +100,7 @@ class ResponseEvaluator(Evaluator):
def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
) -> EvaluationResult:
# If the metric is response_match_score, just use the RougeEvaluator.
if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value:
@@ -112,5 +112,7 @@ class ResponseEvaluator(Evaluator):
)
return _VertexAiEvalFacade(
threshold=self._threshold, metric_name=self._metric_name
threshold=self._threshold,
metric_name=self._metric_name,
expected_invocations_required=True,
).evaluate_invocations(actual_invocations, expected_invocations)
@@ -16,6 +16,7 @@ from __future__ import annotations
import logging
from typing import ClassVar
from typing import Optional
from typing_extensions import override
@@ -281,7 +282,7 @@ class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator):
@override
def format_auto_rater_prompt(
self, actual_invocation: Invocation, _: Invocation
self, actual_invocation: Invocation, _: Optional[Invocation]
) -> str:
"""Returns the autorater prompt."""
@@ -16,6 +16,7 @@ from __future__ import annotations
import logging
from typing import ClassVar
from typing import Optional
from typing_extensions import override
@@ -181,7 +182,7 @@ class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator):
@override
def format_auto_rater_prompt(
self, actual_invocation: Invocation, _: Invocation
self, actual_invocation: Invocation, _: Optional[Invocation]
) -> str:
"""Returns the autorater prompt."""
@@ -14,6 +14,8 @@
from __future__ import annotations
from typing import Optional
from typing_extensions import override
from ..dependencies.vertexai import vertexai
@@ -66,7 +68,7 @@ class SafetyEvaluatorV1(Evaluator):
def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
) -> EvaluationResult:
return _VertexAiEvalFacade(
threshold=self._eval_metric.threshold,
@@ -71,9 +71,12 @@ class TrajectoryEvaluator(Evaluator):
def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
) -> EvaluationResult:
"""Returns EvaluationResult after performing evaluations using actual and expected invocations."""
if expected_invocations is None:
raise ValueError("expected_invocations is needed by this metric.")
total_tool_use_accuracy = 0.0
num_invocations = 0
per_invocation_results = []
@@ -55,23 +55,38 @@ class _VertexAiEvalFacade(Evaluator):
"""
def __init__(
self, threshold: float, metric_name: vertexai_types.PrebuiltMetric
self,
threshold: float,
metric_name: vertexai_types.PrebuiltMetric,
expected_invocations_required=False,
):
self._threshold = threshold
self._metric_name = metric_name
self._expected_invocations_required = expected_invocations_required
@override
def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
) -> EvaluationResult:
if self._expected_invocations_required and expected_invocations is None:
raise ValueError("expected_invocations is needed by this metric.")
# If expected_invocations are not required by the metric and they are not
# supplied, we provide a list of None.
expected_invocations = (
[None] * len(actual_invocations)
if expected_invocations is None
else expected_invocations
)
total_score = 0.0
num_invocations = 0
per_invocation_results = []
for actual, expected in zip(actual_invocations, expected_invocations):
prompt = self._get_text(expected.user_content)
reference = self._get_text(expected.final_response)
prompt = self._get_text(actual.user_content)
reference = self._get_text(expected.final_response) if expected else None
response = self._get_text(actual.final_response)
eval_case = {
"prompt": prompt,
@@ -16,6 +16,7 @@ from __future__ import annotations
import asyncio
import sys
from typing import Optional
from google.adk.agents.llm_agent import LlmAgent
from google.adk.errors.not_found_error import NotFoundError
@@ -70,6 +71,10 @@ def eval_service(
DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
metric_info=FakeEvaluator.get_metric_info(), evaluator=FakeEvaluator
)
DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
metric_info=FakeSingleSidedEvaluator.get_metric_info(),
evaluator=FakeSingleSidedEvaluator,
)
return LocalEvalService(
root_agent=dummy_agent,
eval_sets_manager=mock_eval_sets_manager,
@@ -95,8 +100,10 @@ class FakeEvaluator(Evaluator):
def evaluate_invocations(
self,
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
expected_invocations: Optional[list[Invocation]],
):
if expected_invocations is None:
raise ValueError("expected_invocations is required for this metric.")
per_invocation_results = []
for actual, expected in zip(actual_invocations, expected_invocations):
per_invocation_results.append(
@@ -114,6 +121,42 @@ class FakeEvaluator(Evaluator):
)
class FakeSingleSidedEvaluator(Evaluator):
  """Fake evaluator that scores actual invocations only.

  It never reads `expected_invocations`, so it can be used for eval cases
  where no expected invocations are supplied.
  """

  def __init__(self, eval_metric: EvalMetric):
    self._eval_metric = eval_metric

  @staticmethod
  def get_metric_info() -> MetricInfo:
    """Returns the MetricInfo describing `fake_single_sided_metric`."""
    value_info = MetricValueInfo(
        interval=Interval(min_value=0.0, max_value=1.0)
    )
    return MetricInfo(
        metric_name="fake_single_sided_metric",
        description="Fake single sided metric description",
        metric_value_info=value_info,
    )

  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: Optional[list[Invocation]],
  ):
    """Returns a fixed PASSED result with one entry per actual invocation.

    Args:
      actual_invocations: Invocations to score; each gets a score of 0.995.
      expected_invocations: Ignored by this evaluator.
    """
    scored = [
        PerInvocationResult(
            actual_invocation=invocation,
            score=0.995,
            eval_status=EvalStatus.PASSED,
        )
        for invocation in actual_invocations
    ]
    return EvaluationResult(
        overall_score=0.95,
        overall_eval_status=EvalStatus.PASSED,
        per_invocation_results=scored,
    )
@pytest.mark.asyncio
async def test_perform_inference_success(
eval_service,
@@ -224,19 +267,27 @@ async def test_perform_inference_eval_set_not_found(
async def test_evaluate_success(
eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
):
invocation = Invocation(
user_content=genai_types.Content(
parts=[genai_types.Part(text="test user content.")]
),
final_response=genai_types.Content(
parts=[genai_types.Part(text="test final response.")]
),
)
inference_results = [
InferenceResult(
app_name="test_app",
eval_set_id="test_eval_set",
eval_case_id="case1",
inferences=[],
inferences=[invocation.model_copy(deep=True)],
session_id="session1",
),
InferenceResult(
app_name="test_app",
eval_set_id="test_eval_set",
eval_case_id="case2",
inferences=[],
inferences=[invocation.model_copy(deep=True)],
session_id="session2",
),
]
@@ -247,7 +298,7 @@ async def test_evaluate_success(
)
mock_eval_case = mocker.MagicMock(spec=EvalCase)
mock_eval_case.conversation = []
mock_eval_case.conversation = [invocation.model_copy(deep=True)]
mock_eval_case.conversation_scenario = None
mock_eval_case.session_input = None
mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
@@ -357,7 +408,7 @@ async def test_evaluate_single_inference_result(
@pytest.mark.asyncio
async def test_evaluate_single_inference_result_skipped_for_conversation_scenario(
async def test_evaluate_single_inference_result_for_conversation_scenario(
eval_service, mock_eval_sets_manager, mocker
):
"""To be removed once evaluation is implemented for conversation scenarios."""
@@ -373,10 +424,16 @@ async def test_evaluate_single_inference_result_skipped_for_conversation_scenari
app_name="test_app",
eval_set_id="test_eval_set",
eval_case_id="case1",
inferences=[invocation.model_copy(deep=True)],
inferences=[
invocation.model_copy(deep=True),
invocation.model_copy(deep=True),
invocation.model_copy(deep=True),
],
session_id="session1",
)
eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
eval_metric = EvalMetric(
metric_name="fake_single_sided_metric", threshold=0.5
)
evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
mock_eval_case = mocker.MagicMock(spec=EvalCase)
@@ -390,15 +447,77 @@ async def test_evaluate_single_inference_result_skipped_for_conversation_scenari
)
assert isinstance(result, EvalCaseResult)
assert result.eval_id == "case1"
assert result.final_eval_status == EvalStatus.NOT_EVALUATED
assert not result.overall_eval_metric_results
assert len(result.eval_metric_result_per_invocation) == 1
invocation_result = result.eval_metric_result_per_invocation[0]
assert not invocation_result.eval_metric_results
assert result.final_eval_status == EvalStatus.PASSED
assert len(result.overall_eval_metric_results) == 1
assert (
invocation_result.expected_invocation.final_response.parts[0].text
== "N/A"
result.overall_eval_metric_results[0].metric_name
== "fake_single_sided_metric"
)
assert result.overall_eval_metric_results[0].score == 0.95
mock_eval_sets_manager.get_eval_case.assert_called_once_with(
app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
)
assert len(result.eval_metric_result_per_invocation) == 3
for i in range(3):
invocation_result = result.eval_metric_result_per_invocation[i]
assert invocation_result.actual_invocation == inference_result.inferences[i]
assert invocation_result.expected_invocation == None
assert len(invocation_result.eval_metric_results) == 1
metric_result = invocation_result.eval_metric_results[0]
assert metric_result.metric_name == "fake_single_sided_metric"
assert metric_result.score == 0.995
assert metric_result.eval_status == EvalStatus.PASSED
@pytest.mark.asyncio
async def test_evaluate_single_inference_result_for_conversation_scenario_with_unsupported_metric(
    eval_service, mock_eval_sets_manager, mocker
):
  """Checks a conversation-scenario case evaluated with `fake_metric`.

  The eval case has no `conversation` (no expected invocations). The test
  asserts that the case ends up NOT_EVALUATED, that the overall metric result
  is still reported with a score of None, and that a per-invocation entry
  exists for each of the three inferences.
  """
  user_content = genai_types.Content(
      parts=[genai_types.Part(text="test user content.")]
  )
  final_response = genai_types.Content(
      parts=[genai_types.Part(text="test final response.")]
  )
  invocation = Invocation(
      user_content=user_content, final_response=final_response
  )
  inference_result = InferenceResult(
      app_name="test_app",
      eval_set_id="test_eval_set",
      eval_case_id="case1",
      inferences=[invocation.model_copy(deep=True) for _ in range(3)],
      session_id="session1",
  )
  evaluate_config = EvaluateConfig(
      eval_metrics=[EvalMetric(metric_name="fake_metric", threshold=0.5)],
      parallelism=1,
  )

  # Eval case driven by a conversation scenario: no golden conversation.
  mock_eval_case = mocker.MagicMock(spec=EvalCase)
  mock_eval_case.eval_id = "case1"
  mock_eval_case.conversation = None
  mock_eval_case.conversation_scenario = mocker.MagicMock()
  mock_eval_case.session_input = None
  mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case

  _, result = await eval_service._evaluate_single_inference_result(
      inference_result=inference_result, evaluate_config=evaluate_config
  )

  assert isinstance(result, EvalCaseResult)
  assert result.eval_id == "case1"
  assert result.final_eval_status == EvalStatus.NOT_EVALUATED

  overall_results = result.overall_eval_metric_results
  assert len(overall_results) == 1
  assert overall_results[0].metric_name == "fake_metric"
  assert overall_results[0].score is None

  mock_eval_sets_manager.get_eval_case.assert_called_once_with(
      app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
  )
  assert len(result.eval_metric_result_per_invocation) == 3
def test_generate_final_eval_status_doesn_t_throw_on(eval_service):
@@ -424,9 +543,11 @@ async def test_mcp_stdio_agent_no_runtime_error(mocker):
"""Test that LocalEvalService can handle MCP stdio agents without RuntimeError.
This is a regression test for GitHub issue #2196:
"RuntimeError: Attempted to exit cancel scope in a different task than it was entered in"
"RuntimeError: Attempted to exit cancel scope in a different task than it was
entered in"
The fix ensures that Runner.close() is called to properly cleanup MCP connections.
The fix ensures that Runner.close() is called to properly cleanup MCP
connections.
"""
import tempfile