Files
adk-python/tests/unittests/cli/test_cli_eval.py
T
Ankur Sharma e88e667770 feat: Data model for Rubric based metric and eval config
Details:
- We plan on introducing rubric-based metrics in subsequent changes. This change introduces the data model that allows agent developers to provide rubrics.

- We also introduce a data model for the config that the eval system has been using for quite some time. It was loosely and informally described as a dictionary of metric names and expected thresholds. In this change, we formalize it using a pydantic data model, and extend it to allow developers to specify rubrics as part of their eval config.

What is a rubric based metric?
A rubric-based metric is the assessment of an agent's response (final or intermediate) along some rubric. This way of evaluating the agent's response differs significantly from the strategy where one has to provide a golden response.

PiperOrigin-RevId: 805488436
2025-09-10 13:20:07 -07:00

97 lines
3.5 KiB
Python

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from unittest import mock
from google.adk.cli.cli_eval import _DEFAULT_EVAL_CONFIG
from google.adk.cli.cli_eval import get_eval_metrics_from_config
from google.adk.cli.cli_eval import get_evaluation_criteria_or_default
from google.adk.evaluation.eval_config import EvalConfig
from google.adk.evaluation.eval_rubrics import Rubric
from google.adk.evaluation.eval_rubrics import RubricContent
def test_get_evaluation_criteria_or_default_returns_default():
  """An empty config path must resolve to `_DEFAULT_EVAL_CONFIG`."""
  resolved_config = get_evaluation_criteria_or_default("")

  assert resolved_config == _DEFAULT_EVAL_CONFIG
def test_get_evaluation_criteria_or_default_reads_from_file():
  """A non-empty config path is read via `open` and parsed into an EvalConfig.

  `builtins.open` is patched so that it serves the JSON dump of a known
  `EvalConfig`; the value returned for the dummy path must compare equal to
  that config.
  """
  expected_config = EvalConfig(
      criteria={"tool_trajectory_avg_score": 0.5, "response_match_score": 0.5}
  )
  serialized_config = expected_config.model_dump_json()
  fake_open = mock.mock_open(read_data=serialized_config)

  with mock.patch("builtins.open", fake_open):
    loaded_config = get_evaluation_criteria_or_default("dummy_path")

  assert loaded_config == expected_config
def test_get_eval_metrics_from_config():
  """Each criterion in an EvalConfig is turned into one eval metric.

  The config mixes two plain float thresholds with two dict-style criteria
  (one of which also carries a rubric). The resulting metrics are checked, in
  order, for their name, threshold, criterion threshold, and -- where
  configured -- judge model and rubrics.
  """
  sample_rubric = Rubric(
      rubric_id="test-rubric",
      rubric_content=RubricContent(text_property="test"),
  )
  criteria = {
      "tool_trajectory_avg_score": 1.0,
      "response_match_score": 0.8,
      "final_response_match_v2": {
          "threshold": 0.5,
          "judge_model_options": {
              "judge_model": "gemini-pro",
              "num_samples": 1,
          },
      },
      "rubric_based_final_response_quality_v1": {
          "threshold": 0.9,
          "judge_model_options": {
              "judge_model": "gemini-ultra",
              "num_samples": 1,
          },
          "rubrics": [sample_rubric],
      },
  }

  metrics = get_eval_metrics_from_config(EvalConfig(criteria=criteria))

  # (metric name, threshold, judge model or None when not configured),
  # listed in the order the metrics are expected to come back.
  expectations = [
      ("tool_trajectory_avg_score", 1.0, None),
      ("response_match_score", 0.8, None),
      ("final_response_match_v2", 0.5, "gemini-pro"),
      ("rubric_based_final_response_quality_v1", 0.9, "gemini-ultra"),
  ]

  assert len(metrics) == 4
  for metric, (name, threshold, judge_model) in zip(metrics, expectations):
    assert metric.metric_name == name
    assert metric.threshold == threshold
    assert metric.criterion.threshold == threshold
    if judge_model is not None:
      assert metric.criterion.judge_model_options["judge_model"] == judge_model

  # Only the rubric-based metric carries rubrics.
  rubrics = metrics[3].criterion.rubrics
  assert len(rubrics) == 1
  assert rubrics[0] == sample_rubric
def test_get_eval_metrics_from_config_empty_criteria():
  """An EvalConfig with no criteria produces a falsy (empty) metrics result."""
  empty_config = EvalConfig(criteria={})

  assert not get_eval_metrics_from_config(empty_config)