feat: Enable FinalResponseMatchV2 metric as an experiment

PiperOrigin-RevId: 784346859
2026-03-30 10:57:20 -07:00 · 2025-07-17 15:59:16 -07:00
parent 35de210d4e
commit 36e45cdab3
4 changed files with 39 additions and 12 deletions
@@ -15,6 +15,7 @@
 from __future__ import annotations

 import importlib.util
+import inspect
 import json
 import logging
 import os
@@ -31,6 +32,7 @@ from ..evaluation.eval_case import EvalCase
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_metrics import JudgeModelOptions
 from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.evaluator import EvalStatus
 from ..evaluation.evaluator import Evaluator
@@ -42,6 +44,7 @@ logger = logging.getLogger("google_adk." + __name__)
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
 SAFETY_V1_KEY = "safety_v1"
+FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
 # This evaluation is not very stable.
 # This is always optional unless explicitly specified.
 RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
@@ -191,10 +194,16 @@ async def run_evals(
        for eval_metric in eval_metrics:
          metric_evaluator = _get_evaluator(eval_metric)

-          evaluation_result = metric_evaluator.evaluate_invocations(
-              actual_invocations=inference_result,
-              expected_invocations=eval_case.conversation,
-          )
+          if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+            evaluation_result = await metric_evaluator.evaluate_invocations(
+                actual_invocations=inference_result,
+                expected_invocations=eval_case.conversation,
+            )
+          else:
+            evaluation_result = metric_evaluator.evaluate_invocations(
+                actual_invocations=inference_result,
+                expected_invocations=eval_case.conversation,
+            )

          overall_eval_metric_results.append(
              EvalMetricResult(
@@ -260,6 +269,7 @@ async def run_evals(

 def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
  try:
+    from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
    from ..evaluation.response_evaluator import ResponseEvaluator
    from ..evaluation.safety_evaluator import SafetyEvaluatorV1
    from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
@@ -276,5 +286,8 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
    )
  elif eval_metric.metric_name == SAFETY_V1_KEY:
    return SafetyEvaluatorV1(eval_metric)
+  elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2:
+    eval_metric.judge_model_options = JudgeModelOptions()
+    return FinalResponseMatchV2Evaluator(eval_metric)

  raise ValueError(f"Unsupported eval metric: {eval_metric}")
@@ -36,6 +36,10 @@ class PrebuiltMetrics(Enum):

  RESPONSE_MATCH_SCORE = "response_match_score"

+  SAFETY_V1 = "safety_v1"
+
+  FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
+

 MetricName: TypeAlias = Union[str, PrebuiltMetrics]

@@ -21,7 +21,7 @@ from typing import Optional
 from typing_extensions import override

 from ..models.llm_response import LlmResponse
-from ..utils.feature_decorator import working_in_progress
+from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
  return label


-@working_in_progress
+@experimental
 class FinalResponseMatchV2Evaluator(LlmAsJudge):
  """V2 final response match evaluator which uses an LLM to judge responses.

@@ -21,7 +21,9 @@ from .eval_metrics import EvalMetric
 from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
+from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator

 logger = logging.getLogger("google_adk." + __name__)
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
  metric_evaluator_registry = MetricEvaluatorRegistry()

  metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
-      evaluator=type(TrajectoryEvaluator),
+      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      evaluator=TrajectoryEvaluator,
  )
  metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      evaluator=ResponseEvaluator,
  )
  metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      evaluator=ResponseEvaluator,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      evaluator=SafetyEvaluatorV1,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      evaluator=FinalResponseMatchV2Evaluator,
  )

  return metric_evaluator_registry