You've already forked adk-python
Mirror of https://github.com/encounter/adk-python.git (synced 2026-03-30 10:57:20 -07:00)
feat: Enable FinalResponseMatchV2 metric as an experiment
PiperOrigin-RevId: 784346859
This commit is contained in:
Committed by: Copybara-Service
Parent: 35de210d4e
Commit: 36e45cdab3
@@ -15,6 +15,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import inspect
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -31,6 +32,7 @@ from ..evaluation.eval_case import EvalCase
|
||||
from ..evaluation.eval_metrics import EvalMetric
|
||||
from ..evaluation.eval_metrics import EvalMetricResult
|
||||
from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
|
||||
from ..evaluation.eval_metrics import JudgeModelOptions
|
||||
from ..evaluation.eval_result import EvalCaseResult
|
||||
from ..evaluation.evaluator import EvalStatus
|
||||
from ..evaluation.evaluator import Evaluator
|
||||
@@ -42,6 +44,7 @@ logger = logging.getLogger("google_adk." + __name__)
|
||||
TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
|
||||
RESPONSE_MATCH_SCORE_KEY = "response_match_score"
|
||||
SAFETY_V1_KEY = "safety_v1"
|
||||
FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
|
||||
# This evaluation is not very stable.
|
||||
# This is always optional unless explicitly specified.
|
||||
RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
|
||||
@@ -191,10 +194,16 @@ async def run_evals(
 for eval_metric in eval_metrics:
   metric_evaluator = _get_evaluator(eval_metric)

-  evaluation_result = metric_evaluator.evaluate_invocations(
-      actual_invocations=inference_result,
-      expected_invocations=eval_case.conversation,
-  )
+  if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+    evaluation_result = await metric_evaluator.evaluate_invocations(
+        actual_invocations=inference_result,
+        expected_invocations=eval_case.conversation,
+    )
+  else:
+    evaluation_result = metric_evaluator.evaluate_invocations(
+        actual_invocations=inference_result,
+        expected_invocations=eval_case.conversation,
+    )

   overall_eval_metric_results.append(
       EvalMetricResult(
@@ -260,6 +269,7 @@ async def run_evals(
|
||||
|
||||
def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
|
||||
try:
|
||||
from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
|
||||
from ..evaluation.response_evaluator import ResponseEvaluator
|
||||
from ..evaluation.safety_evaluator import SafetyEvaluatorV1
|
||||
from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
|
||||
@@ -276,5 +286,8 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
|
||||
)
|
||||
elif eval_metric.metric_name == SAFETY_V1_KEY:
|
||||
return SafetyEvaluatorV1(eval_metric)
|
||||
elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2:
|
||||
eval_metric.judge_model_options = JudgeModelOptions()
|
||||
return FinalResponseMatchV2Evaluator(eval_metric)
|
||||
|
||||
raise ValueError(f"Unsupported eval metric: {eval_metric}")
|
||||
|
||||
@@ -36,6 +36,10 @@ class PrebuiltMetrics(Enum):
|
||||
|
||||
RESPONSE_MATCH_SCORE = "response_match_score"
|
||||
|
||||
SAFETY_V1 = "safety_v1"
|
||||
|
||||
FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
|
||||
|
||||
|
||||
MetricName: TypeAlias = Union[str, PrebuiltMetrics]
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ from typing import Optional
 from typing_extensions import override

 from ..models.llm_response import LlmResponse
-from ..utils.feature_decorator import working_in_progress
+from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
   return label


-@working_in_progress
+@experimental
 class FinalResponseMatchV2Evaluator(LlmAsJudge):
   """V2 final response match evaluator which uses an LLM to judge responses.

|
||||
@@ -21,7 +21,9 @@ from .eval_metrics import EvalMetric
|
||||
from .eval_metrics import MetricName
|
||||
from .eval_metrics import PrebuiltMetrics
|
||||
from .evaluator import Evaluator
|
||||
from .final_response_match_v2 import FinalResponseMatchV2Evaluator
|
||||
from .response_evaluator import ResponseEvaluator
|
||||
from .safety_evaluator import SafetyEvaluatorV1
|
||||
from .trajectory_evaluator import TrajectoryEvaluator
|
||||
|
||||
logger = logging.getLogger("google_adk." + __name__)
|
||||
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   metric_evaluator_registry = MetricEvaluatorRegistry()

   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
-      evaluator=type(TrajectoryEvaluator),
+      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      evaluator=TrajectoryEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      evaluator=SafetyEvaluatorV1,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      evaluator=FinalResponseMatchV2Evaluator,
+  )

   return metric_evaluator_registry
|
||||
Reference in New Issue
Block a user