You've already forked adk-python
mirror of
https://github.com/encounter/adk-python.git
synced 2026-03-30 10:57:20 -07:00
chore: Marked expected_invocation as optional field on evaluator interface
ADK already has a set of metrics that don't rely on expected_invocations. Also, for eval cases with a conversation scenario, this would be the mainline case. PiperOrigin-RevId: 825101481
This commit is contained in:
committed by
Copybara-Service
parent
9ab17f2afd
commit
b17c8f19e5
@@ -210,21 +210,23 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
|
||||
|
||||
data = []
|
||||
for per_invocation_result in eval_result.eval_metric_result_per_invocation:
|
||||
actual_invocation = per_invocation_result.actual_invocation
|
||||
expected_invocation = per_invocation_result.expected_invocation
|
||||
row_data = {
|
||||
"prompt": _convert_content_to_text(
|
||||
per_invocation_result.expected_invocation.user_content
|
||||
),
|
||||
"prompt": _convert_content_to_text(actual_invocation.user_content),
|
||||
"expected_response": _convert_content_to_text(
|
||||
per_invocation_result.expected_invocation.final_response
|
||||
expected_invocation.final_response if expected_invocation else None
|
||||
),
|
||||
"actual_response": _convert_content_to_text(
|
||||
per_invocation_result.actual_invocation.final_response
|
||||
actual_invocation.final_response
|
||||
),
|
||||
"expected_tool_calls": _convert_tool_calls_to_text(
|
||||
per_invocation_result.expected_invocation.intermediate_data
|
||||
expected_invocation.intermediate_data
|
||||
if expected_invocation
|
||||
else None
|
||||
),
|
||||
"actual_tool_calls": _convert_tool_calls_to_text(
|
||||
per_invocation_result.actual_invocation.intermediate_data
|
||||
actual_invocation.intermediate_data
|
||||
),
|
||||
}
|
||||
for metric_result in per_invocation_result.eval_metric_results:
|
||||
|
||||
@@ -216,10 +216,11 @@ class EvalMetricResultPerInvocation(EvalBaseModel):
|
||||
)
|
||||
)
|
||||
|
||||
expected_invocation: Invocation = Field(
|
||||
expected_invocation: Optional[Invocation] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"The expected invocation, usually the reference or golden invocation."
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
eval_metric_results: list[EvalMetricResult] = Field(
|
||||
|
||||
@@ -33,7 +33,7 @@ class PerInvocationResult(BaseModel):
|
||||
"""Metric evaluation score per invocation."""
|
||||
|
||||
actual_invocation: Invocation
|
||||
expected_invocation: Invocation
|
||||
expected_invocation: Optional[Invocation] = None
|
||||
score: Optional[float] = None
|
||||
eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
|
||||
rubric_scores: Optional[list[RubricScore]] = None
|
||||
@@ -61,7 +61,16 @@ class Evaluator(ABC):
|
||||
def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
) -> EvaluationResult:
|
||||
"""Returns EvaluationResult after performing evaluations using actual and expected invocations."""
|
||||
"""Returns EvaluationResult after performing evaluations using actual and expected invocations.
|
||||
|
||||
Args:
|
||||
actual_invocations: These are the invocations that are obtained from the
|
||||
agent under test.
|
||||
expected_invocations: An optional list of invocations that, if specified,
|
||||
usually acts as a benchmark/golden response. When it is specified, the
|
||||
expectation is usually that this list and actual_invocations have the
|
||||
same length.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@@ -59,8 +59,11 @@ class RougeEvaluator(Evaluator):
|
||||
def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
) -> EvaluationResult:
|
||||
if expected_invocations is None:
|
||||
raise ValueError("expected_invocations is required for this metric.")
|
||||
|
||||
total_score = 0.0
|
||||
num_invocations = 0
|
||||
per_invocation_results = []
|
||||
|
||||
@@ -147,7 +147,11 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
|
||||
self,
|
||||
eval_metric: EvalMetric,
|
||||
):
|
||||
super().__init__(eval_metric, FinalResponseMatchV2Evaluator.criterion_type)
|
||||
super().__init__(
|
||||
eval_metric,
|
||||
FinalResponseMatchV2Evaluator.criterion_type,
|
||||
expected_invocations_required=True,
|
||||
)
|
||||
self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT
|
||||
|
||||
@staticmethod
|
||||
@@ -166,8 +170,13 @@ class FinalResponseMatchV2Evaluator(LlmAsJudge):
|
||||
|
||||
@override
|
||||
def format_auto_rater_prompt(
|
||||
self, actual_invocation: Invocation, expected_invocation: Invocation
|
||||
self,
|
||||
actual_invocation: Invocation,
|
||||
expected_invocation: Optional[Invocation],
|
||||
) -> str:
|
||||
if expected_invocation is None:
|
||||
raise ValueError("expected_invocation is required for this metric.")
|
||||
|
||||
reference = get_text_from_content(expected_invocation.final_response)
|
||||
response = get_text_from_content(actual_invocation.final_response)
|
||||
user_prompt = get_text_from_content(expected_invocation.user_content)
|
||||
|
||||
@@ -395,7 +395,8 @@ class HallucinationsV1Evaluator(Evaluator):
|
||||
},
|
||||
{
|
||||
"name": "get_weather",
|
||||
"description": '''Gets the weather of the given place at the given time.
|
||||
"description": '''Gets the weather of the given place at the given
|
||||
time.
|
||||
|
||||
Args:
|
||||
location: The location for which to retrieve weather information.
|
||||
@@ -408,7 +409,8 @@ class HallucinationsV1Evaluator(Evaluator):
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {
|
||||
"description": "The location for which to retrieve weather information.",
|
||||
"description": "The location for which to retrieve weather
|
||||
information.",
|
||||
"type": "string"
|
||||
},
|
||||
"time": {
|
||||
@@ -711,8 +713,15 @@ class HallucinationsV1Evaluator(Evaluator):
|
||||
async def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
) -> EvaluationResult:
|
||||
# expected_invocations are not required by the metric; if they are not
|
||||
# supplied, we provide a list of None to the rest of the code.
|
||||
expected_invocations = (
|
||||
[None] * len(actual_invocations)
|
||||
if expected_invocations is None
|
||||
else expected_invocations
|
||||
)
|
||||
per_invocation_results = []
|
||||
for actual, expected in zip(actual_invocations, expected_invocations):
|
||||
step_evaluations = self._get_steps_to_evaluate(actual)
|
||||
|
||||
@@ -60,9 +60,13 @@ class LlmAsJudge(Evaluator):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, eval_metric: EvalMetric, criterion_type: type[BaseCriterion]
|
||||
self,
|
||||
eval_metric: EvalMetric,
|
||||
criterion_type: type[BaseCriterion],
|
||||
expected_invocations_required=False,
|
||||
):
|
||||
self._eval_metric = eval_metric
|
||||
self._expected_invocations_required = expected_invocations_required
|
||||
|
||||
expected_criterion_type_error = ValueError(
|
||||
f"`{eval_metric.metric_name}` metric expects a criterion of type"
|
||||
@@ -84,7 +88,7 @@ class LlmAsJudge(Evaluator):
|
||||
|
||||
@abstractmethod
|
||||
def format_auto_rater_prompt(
|
||||
self, actual: Invocation, expected: Invocation
|
||||
self, actual: Invocation, expected: Optional[Invocation]
|
||||
) -> str:
|
||||
"""Formats the auto-rater prompt to evaluate the given invocation."""
|
||||
|
||||
@@ -112,8 +116,19 @@ class LlmAsJudge(Evaluator):
|
||||
async def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
) -> EvaluationResult:
|
||||
if self._expected_invocations_required and expected_invocations is None:
|
||||
raise ValueError("expected_invocations is needed by this metric.")
|
||||
|
||||
# If expected_invocations are not required by the metric and are not
|
||||
# supplied, we provide a list of None.
|
||||
expected_invocations = (
|
||||
[None] * len(actual_invocations)
|
||||
if expected_invocations is None
|
||||
else expected_invocations
|
||||
)
|
||||
|
||||
per_invocation_results = []
|
||||
for actual, expected in zip(actual_invocations, expected_invocations):
|
||||
auto_rater_prompt = self.format_auto_rater_prompt(actual, expected)
|
||||
|
||||
@@ -22,8 +22,6 @@ from typing import Callable
|
||||
from typing import Optional
|
||||
import uuid
|
||||
|
||||
from google.genai.types import Content
|
||||
from google.genai.types import Part
|
||||
from typing_extensions import override
|
||||
|
||||
from ..agents.base_agent import BaseAgent
|
||||
@@ -51,6 +49,7 @@ from .eval_sets_manager import EvalSetsManager
|
||||
from .evaluation_generator import EvaluationGenerator
|
||||
from .evaluator import EvalStatus
|
||||
from .evaluator import EvaluationResult
|
||||
from .evaluator import PerInvocationResult
|
||||
from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
|
||||
from .metric_evaluator_registry import MetricEvaluatorRegistry
|
||||
from .user_simulator_provider import UserSimulatorProvider
|
||||
@@ -222,43 +221,9 @@ class LocalEvalService(BaseEvalService):
|
||||
else 'test_user_id'
|
||||
)
|
||||
|
||||
if eval_case.conversation_scenario:
|
||||
logger.warning(
|
||||
'Skipping evaluation of variable-length conversation scenario in eval'
|
||||
' set/case %s/%s.',
|
||||
inference_result.eval_set_id,
|
||||
inference_result.eval_case_id,
|
||||
)
|
||||
for actual_invocation in inference_result.inferences:
|
||||
eval_metric_result_per_invocation.append(
|
||||
EvalMetricResultPerInvocation(
|
||||
actual_invocation=actual_invocation,
|
||||
expected_invocation=Invocation(
|
||||
user_content=actual_invocation.user_content,
|
||||
final_response=Content(
|
||||
parts=[Part(text='N/A')], role='model'
|
||||
),
|
||||
),
|
||||
)
|
||||
)
|
||||
eval_case_result = EvalCaseResult(
|
||||
eval_set_file=inference_result.eval_set_id,
|
||||
eval_set_id=inference_result.eval_set_id,
|
||||
eval_id=inference_result.eval_case_id,
|
||||
final_eval_status=EvalStatus.NOT_EVALUATED,
|
||||
overall_eval_metric_results=overall_eval_metric_results,
|
||||
eval_metric_result_per_invocation=eval_metric_result_per_invocation,
|
||||
session_id=inference_result.session_id,
|
||||
session_details=await self._session_service.get_session(
|
||||
app_name=inference_result.app_name,
|
||||
user_id=user_id,
|
||||
session_id=inference_result.session_id,
|
||||
),
|
||||
user_id=user_id,
|
||||
)
|
||||
return (inference_result, eval_case_result)
|
||||
|
||||
if len(inference_result.inferences) != len(eval_case.conversation):
|
||||
if eval_case.conversation_scenario is None and len(
|
||||
inference_result.inferences
|
||||
) != len(eval_case.conversation):
|
||||
raise ValueError(
|
||||
'Inferences should match conversations in eval case. Found'
|
||||
f'{len(inference_result.inferences)} inferences '
|
||||
@@ -266,13 +231,13 @@ class LocalEvalService(BaseEvalService):
|
||||
)
|
||||
|
||||
# Pre-creating the EvalMetricResults entries for each invocation.
|
||||
for actual, expected in zip(
|
||||
inference_result.inferences, eval_case.conversation
|
||||
):
|
||||
for idx, actual in enumerate(inference_result.inferences):
|
||||
eval_metric_result_per_invocation.append(
|
||||
EvalMetricResultPerInvocation(
|
||||
actual_invocation=actual,
|
||||
expected_invocation=expected,
|
||||
expected_invocation=eval_case.conversation[idx]
|
||||
if eval_case.conversation
|
||||
else None,
|
||||
# We will fill this as we evaluate each metric per invocation.
|
||||
eval_metric_results=[],
|
||||
)
|
||||
@@ -280,11 +245,27 @@ class LocalEvalService(BaseEvalService):
|
||||
|
||||
for eval_metric in evaluate_config.eval_metrics:
|
||||
# Perform evaluation of the metric.
|
||||
evaluation_result = await self._evaluate_metric(
|
||||
eval_metric=eval_metric,
|
||||
actual_invocations=inference_result.inferences,
|
||||
expected_invocations=eval_case.conversation,
|
||||
)
|
||||
try:
|
||||
evaluation_result = await self._evaluate_metric(
|
||||
eval_metric=eval_metric,
|
||||
actual_invocations=inference_result.inferences,
|
||||
expected_invocations=eval_case.conversation,
|
||||
)
|
||||
except Exception as e:
|
||||
# We intentionally catch the Exception as we don't want failures to
|
||||
# affect other metric evaluation.
|
||||
logger.error(
|
||||
"Metric evaluation failed for metric `%s` for eval case id '%s'"
|
||||
' with following error `%s`',
|
||||
eval_metric.metric_name,
|
||||
eval_case.eval_id,
|
||||
e,
|
||||
exc_info=True,
|
||||
)
|
||||
# We use an empty result.
|
||||
evaluation_result = EvaluationResult(
|
||||
overall_eval_status=EvalStatus.NOT_EVALUATED
|
||||
)
|
||||
|
||||
# Track overall score across all invocations.
|
||||
eval_metric_result_details = EvalMetricResultDetails(
|
||||
@@ -299,8 +280,10 @@ class LocalEvalService(BaseEvalService):
|
||||
)
|
||||
)
|
||||
|
||||
if len(evaluation_result.per_invocation_results) != len(
|
||||
eval_metric_result_per_invocation
|
||||
if (
|
||||
evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
|
||||
and len(evaluation_result.per_invocation_results)
|
||||
!= len(eval_metric_result_per_invocation)
|
||||
):
|
||||
raise ValueError(
|
||||
'Eval metric should return results for each invocation. Found '
|
||||
@@ -309,10 +292,14 @@ class LocalEvalService(BaseEvalService):
|
||||
)
|
||||
|
||||
# Track score across individual invocations.
|
||||
for invocation_result, invocation in zip(
|
||||
evaluation_result.per_invocation_results,
|
||||
eval_metric_result_per_invocation,
|
||||
):
|
||||
for idx, invocation in enumerate(eval_metric_result_per_invocation):
|
||||
invocation_result = (
|
||||
evaluation_result.per_invocation_results[idx]
|
||||
if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
|
||||
else PerInvocationResult(
|
||||
actual_invocation=invocation.actual_invocation
|
||||
)
|
||||
)
|
||||
eval_metric_result_details = EvalMetricResultDetails(
|
||||
rubric_scores=invocation_result.rubric_scores
|
||||
)
|
||||
@@ -351,7 +338,7 @@ class LocalEvalService(BaseEvalService):
|
||||
self,
|
||||
eval_metric: EvalMetric,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
) -> EvaluationResult:
|
||||
"""Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ class ResponseEvaluator(Evaluator):
|
||||
def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
) -> EvaluationResult:
|
||||
# If the metric is response_match_score, just use the RougeEvaluator.
|
||||
if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value:
|
||||
@@ -112,5 +112,7 @@ class ResponseEvaluator(Evaluator):
|
||||
)
|
||||
|
||||
return _VertexAiEvalFacade(
|
||||
threshold=self._threshold, metric_name=self._metric_name
|
||||
threshold=self._threshold,
|
||||
metric_name=self._metric_name,
|
||||
expected_invocations_required=True,
|
||||
).evaluate_invocations(actual_invocations, expected_invocations)
|
||||
|
||||
@@ -16,6 +16,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import ClassVar
|
||||
from typing import Optional
|
||||
|
||||
from typing_extensions import override
|
||||
|
||||
@@ -281,7 +282,7 @@ class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator):
|
||||
|
||||
@override
|
||||
def format_auto_rater_prompt(
|
||||
self, actual_invocation: Invocation, _: Invocation
|
||||
self, actual_invocation: Invocation, _: Optional[Invocation]
|
||||
) -> str:
|
||||
"""Returns the autorater prompt."""
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import ClassVar
|
||||
from typing import Optional
|
||||
|
||||
from typing_extensions import override
|
||||
|
||||
@@ -181,7 +182,7 @@ class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator):
|
||||
|
||||
@override
|
||||
def format_auto_rater_prompt(
|
||||
self, actual_invocation: Invocation, _: Invocation
|
||||
self, actual_invocation: Invocation, _: Optional[Invocation]
|
||||
) -> str:
|
||||
"""Returns the autorater prompt."""
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from typing_extensions import override
|
||||
|
||||
from ..dependencies.vertexai import vertexai
|
||||
@@ -66,7 +68,7 @@ class SafetyEvaluatorV1(Evaluator):
|
||||
def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
) -> EvaluationResult:
|
||||
return _VertexAiEvalFacade(
|
||||
threshold=self._eval_metric.threshold,
|
||||
|
||||
@@ -71,9 +71,12 @@ class TrajectoryEvaluator(Evaluator):
|
||||
def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
) -> EvaluationResult:
|
||||
"""Returns EvaluationResult after performing evaluations using actual and expected invocations."""
|
||||
if expected_invocations is None:
|
||||
raise ValueError("expected_invocations is needed by this metric.")
|
||||
|
||||
total_tool_use_accuracy = 0.0
|
||||
num_invocations = 0
|
||||
per_invocation_results = []
|
||||
|
||||
@@ -55,23 +55,38 @@ class _VertexAiEvalFacade(Evaluator):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, threshold: float, metric_name: vertexai_types.PrebuiltMetric
|
||||
self,
|
||||
threshold: float,
|
||||
metric_name: vertexai_types.PrebuiltMetric,
|
||||
expected_invocations_required=False,
|
||||
):
|
||||
self._threshold = threshold
|
||||
self._metric_name = metric_name
|
||||
self._expected_invocations_required = expected_invocations_required
|
||||
|
||||
@override
|
||||
def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
) -> EvaluationResult:
|
||||
if self._expected_invocations_required and expected_invocations is None:
|
||||
raise ValueError("expected_invocations is needed by this metric.")
|
||||
|
||||
# If expected_invocations are not required by the metric and are not
|
||||
# supplied, we provide a list of None.
|
||||
expected_invocations = (
|
||||
[None] * len(actual_invocations)
|
||||
if expected_invocations is None
|
||||
else expected_invocations
|
||||
)
|
||||
|
||||
total_score = 0.0
|
||||
num_invocations = 0
|
||||
per_invocation_results = []
|
||||
for actual, expected in zip(actual_invocations, expected_invocations):
|
||||
prompt = self._get_text(expected.user_content)
|
||||
reference = self._get_text(expected.final_response)
|
||||
prompt = self._get_text(actual.user_content)
|
||||
reference = self._get_text(expected.final_response) if expected else None
|
||||
response = self._get_text(actual.final_response)
|
||||
eval_case = {
|
||||
"prompt": prompt,
|
||||
|
||||
@@ -16,6 +16,7 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from google.adk.agents.llm_agent import LlmAgent
|
||||
from google.adk.errors.not_found_error import NotFoundError
|
||||
@@ -70,6 +71,10 @@ def eval_service(
|
||||
DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
|
||||
metric_info=FakeEvaluator.get_metric_info(), evaluator=FakeEvaluator
|
||||
)
|
||||
DEFAULT_METRIC_EVALUATOR_REGISTRY.register_evaluator(
|
||||
metric_info=FakeSingleSidedEvaluator.get_metric_info(),
|
||||
evaluator=FakeSingleSidedEvaluator,
|
||||
)
|
||||
return LocalEvalService(
|
||||
root_agent=dummy_agent,
|
||||
eval_sets_manager=mock_eval_sets_manager,
|
||||
@@ -95,8 +100,10 @@ class FakeEvaluator(Evaluator):
|
||||
def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
):
|
||||
if expected_invocations is None:
|
||||
raise ValueError("expected_invocations is required for this metric.")
|
||||
per_invocation_results = []
|
||||
for actual, expected in zip(actual_invocations, expected_invocations):
|
||||
per_invocation_results.append(
|
||||
@@ -114,6 +121,42 @@ class FakeEvaluator(Evaluator):
|
||||
)
|
||||
|
||||
|
||||
class FakeSingleSidedEvaluator(Evaluator):
|
||||
|
||||
def __init__(self, eval_metric: EvalMetric):
|
||||
self._eval_metric = eval_metric
|
||||
|
||||
@staticmethod
|
||||
def get_metric_info() -> MetricInfo:
|
||||
return MetricInfo(
|
||||
metric_name="fake_single_sided_metric",
|
||||
description="Fake single sided metric description",
|
||||
metric_value_info=MetricValueInfo(
|
||||
interval=Interval(min_value=0.0, max_value=1.0)
|
||||
),
|
||||
)
|
||||
|
||||
def evaluate_invocations(
|
||||
self,
|
||||
actual_invocations: list[Invocation],
|
||||
expected_invocations: Optional[list[Invocation]],
|
||||
):
|
||||
per_invocation_results = []
|
||||
for actual in actual_invocations:
|
||||
per_invocation_results.append(
|
||||
PerInvocationResult(
|
||||
actual_invocation=actual,
|
||||
score=0.995,
|
||||
eval_status=EvalStatus.PASSED,
|
||||
)
|
||||
)
|
||||
return EvaluationResult(
|
||||
overall_score=0.95,
|
||||
overall_eval_status=EvalStatus.PASSED,
|
||||
per_invocation_results=per_invocation_results,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_perform_inference_success(
|
||||
eval_service,
|
||||
@@ -224,19 +267,27 @@ async def test_perform_inference_eval_set_not_found(
|
||||
async def test_evaluate_success(
|
||||
eval_service, mock_eval_sets_manager, mock_eval_set_results_manager, mocker
|
||||
):
|
||||
invocation = Invocation(
|
||||
user_content=genai_types.Content(
|
||||
parts=[genai_types.Part(text="test user content.")]
|
||||
),
|
||||
final_response=genai_types.Content(
|
||||
parts=[genai_types.Part(text="test final response.")]
|
||||
),
|
||||
)
|
||||
inference_results = [
|
||||
InferenceResult(
|
||||
app_name="test_app",
|
||||
eval_set_id="test_eval_set",
|
||||
eval_case_id="case1",
|
||||
inferences=[],
|
||||
inferences=[invocation.model_copy(deep=True)],
|
||||
session_id="session1",
|
||||
),
|
||||
InferenceResult(
|
||||
app_name="test_app",
|
||||
eval_set_id="test_eval_set",
|
||||
eval_case_id="case2",
|
||||
inferences=[],
|
||||
inferences=[invocation.model_copy(deep=True)],
|
||||
session_id="session2",
|
||||
),
|
||||
]
|
||||
@@ -247,7 +298,7 @@ async def test_evaluate_success(
|
||||
)
|
||||
|
||||
mock_eval_case = mocker.MagicMock(spec=EvalCase)
|
||||
mock_eval_case.conversation = []
|
||||
mock_eval_case.conversation = [invocation.model_copy(deep=True)]
|
||||
mock_eval_case.conversation_scenario = None
|
||||
mock_eval_case.session_input = None
|
||||
mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
|
||||
@@ -357,7 +408,7 @@ async def test_evaluate_single_inference_result(
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_single_inference_result_skipped_for_conversation_scenario(
|
||||
async def test_evaluate_single_inference_result_for_conversation_scenario(
|
||||
eval_service, mock_eval_sets_manager, mocker
|
||||
):
|
||||
"""To be removed once evaluation is implemented for conversation scenarios."""
|
||||
@@ -373,10 +424,16 @@ async def test_evaluate_single_inference_result_skipped_for_conversation_scenari
|
||||
app_name="test_app",
|
||||
eval_set_id="test_eval_set",
|
||||
eval_case_id="case1",
|
||||
inferences=[invocation.model_copy(deep=True)],
|
||||
inferences=[
|
||||
invocation.model_copy(deep=True),
|
||||
invocation.model_copy(deep=True),
|
||||
invocation.model_copy(deep=True),
|
||||
],
|
||||
session_id="session1",
|
||||
)
|
||||
eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
|
||||
eval_metric = EvalMetric(
|
||||
metric_name="fake_single_sided_metric", threshold=0.5
|
||||
)
|
||||
evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
|
||||
|
||||
mock_eval_case = mocker.MagicMock(spec=EvalCase)
|
||||
@@ -390,15 +447,77 @@ async def test_evaluate_single_inference_result_skipped_for_conversation_scenari
|
||||
)
|
||||
assert isinstance(result, EvalCaseResult)
|
||||
assert result.eval_id == "case1"
|
||||
assert result.final_eval_status == EvalStatus.NOT_EVALUATED
|
||||
assert not result.overall_eval_metric_results
|
||||
assert len(result.eval_metric_result_per_invocation) == 1
|
||||
invocation_result = result.eval_metric_result_per_invocation[0]
|
||||
assert not invocation_result.eval_metric_results
|
||||
assert result.final_eval_status == EvalStatus.PASSED
|
||||
assert len(result.overall_eval_metric_results) == 1
|
||||
assert (
|
||||
invocation_result.expected_invocation.final_response.parts[0].text
|
||||
== "N/A"
|
||||
result.overall_eval_metric_results[0].metric_name
|
||||
== "fake_single_sided_metric"
|
||||
)
|
||||
assert result.overall_eval_metric_results[0].score == 0.95
|
||||
mock_eval_sets_manager.get_eval_case.assert_called_once_with(
|
||||
app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
|
||||
)
|
||||
|
||||
assert len(result.eval_metric_result_per_invocation) == 3
|
||||
for i in range(3):
|
||||
invocation_result = result.eval_metric_result_per_invocation[i]
|
||||
assert invocation_result.actual_invocation == inference_result.inferences[i]
|
||||
assert invocation_result.expected_invocation == None
|
||||
assert len(invocation_result.eval_metric_results) == 1
|
||||
metric_result = invocation_result.eval_metric_results[0]
|
||||
assert metric_result.metric_name == "fake_single_sided_metric"
|
||||
assert metric_result.score == 0.995
|
||||
assert metric_result.eval_status == EvalStatus.PASSED
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_single_inference_result_for_conversation_scenario_with_unsupported_metric(
|
||||
eval_service, mock_eval_sets_manager, mocker
|
||||
):
|
||||
"""To be removed once evaluation is implemented for conversation scenarios."""
|
||||
invocation = Invocation(
|
||||
user_content=genai_types.Content(
|
||||
parts=[genai_types.Part(text="test user content.")]
|
||||
),
|
||||
final_response=genai_types.Content(
|
||||
parts=[genai_types.Part(text="test final response.")]
|
||||
),
|
||||
)
|
||||
inference_result = InferenceResult(
|
||||
app_name="test_app",
|
||||
eval_set_id="test_eval_set",
|
||||
eval_case_id="case1",
|
||||
inferences=[
|
||||
invocation.model_copy(deep=True),
|
||||
invocation.model_copy(deep=True),
|
||||
invocation.model_copy(deep=True),
|
||||
],
|
||||
session_id="session1",
|
||||
)
|
||||
eval_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
|
||||
evaluate_config = EvaluateConfig(eval_metrics=[eval_metric], parallelism=1)
|
||||
|
||||
mock_eval_case = mocker.MagicMock(spec=EvalCase)
|
||||
mock_eval_case.eval_id = "case1"
|
||||
mock_eval_case.conversation = None
|
||||
mock_eval_case.conversation_scenario = mocker.MagicMock()
|
||||
mock_eval_case.session_input = None
|
||||
mock_eval_sets_manager.get_eval_case.return_value = mock_eval_case
|
||||
|
||||
_, result = await eval_service._evaluate_single_inference_result(
|
||||
inference_result=inference_result, evaluate_config=evaluate_config
|
||||
)
|
||||
assert isinstance(result, EvalCaseResult)
|
||||
assert result.eval_id == "case1"
|
||||
assert result.final_eval_status == EvalStatus.NOT_EVALUATED
|
||||
assert len(result.overall_eval_metric_results) == 1
|
||||
assert result.overall_eval_metric_results[0].metric_name == "fake_metric"
|
||||
assert result.overall_eval_metric_results[0].score is None
|
||||
mock_eval_sets_manager.get_eval_case.assert_called_once_with(
|
||||
app_name="test_app", eval_set_id="test_eval_set", eval_case_id="case1"
|
||||
)
|
||||
|
||||
assert len(result.eval_metric_result_per_invocation) == 3
|
||||
|
||||
|
||||
def test_generate_final_eval_status_doesn_t_throw_on(eval_service):
|
||||
@@ -424,9 +543,11 @@ async def test_mcp_stdio_agent_no_runtime_error(mocker):
|
||||
"""Test that LocalEvalService can handle MCP stdio agents without RuntimeError.
|
||||
|
||||
This is a regression test for GitHub issue #2196:
|
||||
"RuntimeError: Attempted to exit cancel scope in a different task than it was entered in"
|
||||
"RuntimeError: Attempted to exit cancel scope in a different task than it was
|
||||
entered in"
|
||||
|
||||
The fix ensures that Runner.close() is called to properly cleanup MCP connections.
|
||||
The fix ensures that Runner.close() is called to properly cleanup MCP
|
||||
connections.
|
||||
"""
|
||||
import tempfile
|
||||
|
||||
|
||||
Reference in New Issue
Block a user