feat: Enable FinalResponseMatchV2 metric as an experiment

PiperOrigin-RevId: 784346859
This commit is contained in:
Ankur Sharma
2025-07-17 15:59:16 -07:00
committed by Copybara-Service
parent 35de210d4e
commit 36e45cdab3
4 changed files with 39 additions and 12 deletions
+17 -4
View File
@@ -15,6 +15,7 @@
from __future__ import annotations
import importlib.util
import inspect
import json
import logging
import os
@@ -31,6 +32,7 @@ from ..evaluation.eval_case import EvalCase
from ..evaluation.eval_metrics import EvalMetric
from ..evaluation.eval_metrics import EvalMetricResult
from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
from ..evaluation.eval_metrics import JudgeModelOptions
from ..evaluation.eval_result import EvalCaseResult
from ..evaluation.evaluator import EvalStatus
from ..evaluation.evaluator import Evaluator
@@ -42,6 +44,7 @@ logger = logging.getLogger("google_adk." + __name__)
TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
RESPONSE_MATCH_SCORE_KEY = "response_match_score"
SAFETY_V1_KEY = "safety_v1"
FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
# The response evaluation score is not very stable, so this metric is
# always optional unless it is explicitly specified.
RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
@@ -191,10 +194,16 @@ async def run_evals(
for eval_metric in eval_metrics:
metric_evaluator = _get_evaluator(eval_metric)
evaluation_result = metric_evaluator.evaluate_invocations(
actual_invocations=inference_result,
expected_invocations=eval_case.conversation,
)
if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
evaluation_result = await metric_evaluator.evaluate_invocations(
actual_invocations=inference_result,
expected_invocations=eval_case.conversation,
)
else:
evaluation_result = metric_evaluator.evaluate_invocations(
actual_invocations=inference_result,
expected_invocations=eval_case.conversation,
)
overall_eval_metric_results.append(
EvalMetricResult(
@@ -260,6 +269,7 @@ async def run_evals(
def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
try:
from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
from ..evaluation.response_evaluator import ResponseEvaluator
from ..evaluation.safety_evaluator import SafetyEvaluatorV1
from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
@@ -276,5 +286,8 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
)
elif eval_metric.metric_name == SAFETY_V1_KEY:
return SafetyEvaluatorV1(eval_metric)
elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2:
eval_metric.judge_model_options = JudgeModelOptions()
return FinalResponseMatchV2Evaluator(eval_metric)
raise ValueError(f"Unsupported eval metric: {eval_metric}")
@@ -36,6 +36,10 @@ class PrebuiltMetrics(Enum):
RESPONSE_MATCH_SCORE = "response_match_score"
SAFETY_V1 = "safety_v1"
FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
MetricName: TypeAlias = Union[str, PrebuiltMetrics]
@@ -21,7 +21,7 @@ from typing import Optional
from typing_extensions import override
from ..models.llm_response import LlmResponse
from ..utils.feature_decorator import working_in_progress
from ..utils.feature_decorator import experimental
from .eval_case import Invocation
from .eval_metrics import EvalMetric
from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
return label
@working_in_progress
@experimental
class FinalResponseMatchV2Evaluator(LlmAsJudge):
"""V2 final response match evaluator which uses an LLM to judge responses.
@@ -21,7 +21,9 @@ from .eval_metrics import EvalMetric
from .eval_metrics import MetricName
from .eval_metrics import PrebuiltMetrics
from .evaluator import Evaluator
from .final_response_match_v2 import FinalResponseMatchV2Evaluator
from .response_evaluator import ResponseEvaluator
from .safety_evaluator import SafetyEvaluatorV1
from .trajectory_evaluator import TrajectoryEvaluator
logger = logging.getLogger("google_adk." + __name__)
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
metric_evaluator_registry = MetricEvaluatorRegistry()
metric_evaluator_registry.register_evaluator(
metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
evaluator=type(TrajectoryEvaluator),
metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
evaluator=TrajectoryEvaluator,
)
metric_evaluator_registry.register_evaluator(
metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
evaluator=type(ResponseEvaluator),
metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
evaluator=ResponseEvaluator,
)
metric_evaluator_registry.register_evaluator(
metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
evaluator=type(ResponseEvaluator),
metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
evaluator=ResponseEvaluator,
)
metric_evaluator_registry.register_evaluator(
metric_name=PrebuiltMetrics.SAFETY_V1.value,
evaluator=SafetyEvaluatorV1,
)
metric_evaluator_registry.register_evaluator(
metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
evaluator=FinalResponseMatchV2Evaluator,
)
return metric_evaluator_registry