feat: Support per-eval case and per-invocation rubrics in rubric-based evaluators

Co-authored-by: Joseph Pagadora <jcpagadora@google.com>
PiperOrigin-RevId: 853820099
This commit is contained in:
Joseph Pagadora
2026-01-08 11:25:24 -08:00
committed by Copybara-Service
parent 688791396a
commit 8afb99a078
8 changed files with 402 additions and 89 deletions
+6 -2
View File
@@ -201,9 +201,11 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
for r in metric_result.criterion.rubrics
}
for rubric_score in metric_result.details.rubric_scores:
rubric = rubrics_by_id.get(rubric_score.rubric_id)
rubric_text = rubrics_by_id.get(rubric_score.rubric_id)
if not rubric_text:
rubric_text = rubric_score.rubric_id
click.echo(
f"Rubric: {rubric}, "
f"Rubric: {rubric_text}, "
f"Score: {rubric_score.score}, "
f"Reasoning: {rubric_score.rationale}"
)
@@ -243,6 +245,8 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
}
for rubric_score in metric_result.details.rubric_scores:
rubric = rubrics_by_id.get(rubric_score.rubric_id)
if not rubric:
rubric = rubric_score.rubric_id
row_data[f"Rubric: {rubric}"] = (
f"Reasoning: {rubric_score.rationale}, "
f"Score: {rubric_score.score}"
+137 -67
View File
@@ -46,6 +46,7 @@ from .eval_metrics import EvalMetric
from .eval_metrics import EvalMetricResult
from .eval_metrics import EvalMetricResultDetails
from .eval_metrics import EvalMetricResultPerInvocation
from .eval_metrics import Rubric
from .eval_result import EvalCaseResult
from .eval_set import EvalCase
from .eval_set_results_manager import EvalSetResultsManager
@@ -67,6 +68,46 @@ def _get_session_id() -> str:
return f'{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}'
def _add_rubrics_to_invocation(
    invocation: Invocation, rubrics_to_add: list[Rubric]
):
  """Adds rubrics to invocation, throwing ValueError on duplicate rubric_id.

  The whole batch is validated before the invocation is touched, so a failed
  call leaves `invocation.rubrics` exactly as it was.

  Args:
    invocation: The invocation whose `rubrics` list is extended in place. A
      falsy `rubrics` value (None or an empty list) is replaced by a new list.
    rubrics_to_add: Rubrics to append, in the given order.

  Raises:
    ValueError: If a rubric_id in `rubrics_to_add` is already present on the
      invocation, or appears more than once within `rubrics_to_add`.
  """
  # Validate first: raising halfway through an append loop would leave the
  # invocation holding only part of the batch.
  seen_ids = {r.rubric_id for r in invocation.rubrics or []}
  for rubric in rubrics_to_add:
    if rubric.rubric_id in seen_ids:
      raise ValueError(
          f"Rubric with rubric_id '{rubric.rubric_id}' already exists."
      )
    seen_ids.add(rubric.rubric_id)

  if not invocation.rubrics:
    invocation.rubrics = []
  invocation.rubrics.extend(rubrics_to_add)
def _copy_eval_case_rubrics_to_actual_invocations(
    eval_case: EvalCase, actual_invocations: list[Invocation]
):
  """Attaches the eval case's own rubrics to every actual invocation.

  Nothing happens when the eval case has no `rubrics` attribute or when that
  attribute is falsy.

  Args:
    eval_case: The eval case whose `rubrics` are copied.
    actual_invocations: The invocations that receive the rubrics.
  """
  case_rubrics = getattr(eval_case, 'rubrics', None)
  if not case_rubrics:
    return
  for actual_invocation in actual_invocations:
    _add_rubrics_to_invocation(actual_invocation, case_rubrics)
def _copy_invocation_rubrics_to_actual_invocations(
    expected_invocations: Optional[list[Invocation]],
    actual_invocations: list[Invocation],
):
  """Transfers each expected invocation's rubrics to its actual counterpart.

  Invocations are paired by position with `zip`, so pairing stops at the end
  of the shorter list. Expected invocations without rubrics are skipped.

  Args:
    expected_invocations: The expected invocations, or None/empty when there
      are none, in which case nothing is copied.
    actual_invocations: The invocations that receive the rubrics.
  """
  if not expected_invocations:
    return
  for actual, expected in zip(actual_invocations, expected_invocations):
    expected_rubrics = expected.rubrics
    if not expected_rubrics:
      continue
    _add_rubrics_to_invocation(actual, expected_rubrics)
@experimental
class LocalEvalService(BaseEvalService):
"""An implementation of BaseEvalService, that runs the evals locally."""
@@ -249,76 +290,27 @@ class LocalEvalService(BaseEvalService):
)
)
actual_invocations = inference_result.inferences
expected_invocations = eval_case.conversation
# 1. Copy EvalCase level rubrics to all actual invocations.
_copy_eval_case_rubrics_to_actual_invocations(eval_case, actual_invocations)
# 2. If expected invocations are present, copy invocation level
# rubrics to corresponding actual invocations.
_copy_invocation_rubrics_to_actual_invocations(
expected_invocations, actual_invocations
)
for eval_metric in evaluate_config.eval_metrics:
# Perform evaluation of the metric.
try:
with client_label_context(EVAL_CLIENT_LABEL):
evaluation_result = await self._evaluate_metric(
eval_metric=eval_metric,
actual_invocations=inference_result.inferences,
expected_invocations=eval_case.conversation,
conversation_scenario=eval_case.conversation_scenario,
)
except Exception as e:
# We intentionally catch the Exception as we don't want failures to
# affect other metric evaluation.
logger.error(
"Metric evaluation failed for metric `%s` for eval case id '%s'"
' with following error `%s`',
eval_metric.metric_name,
eval_case.eval_id,
e,
exc_info=True,
)
# We use an empty result.
evaluation_result = EvaluationResult(
overall_eval_status=EvalStatus.NOT_EVALUATED
)
# Track overall score across all invocations.
eval_metric_result_details = EvalMetricResultDetails(
rubric_scores=evaluation_result.overall_rubric_scores
await self._evaluate_metric_for_eval_case(
eval_metric,
eval_case,
inference_result,
eval_metric_result_per_invocation,
overall_eval_metric_results,
)
overall_eval_metric_results.append(
EvalMetricResult(
score=evaluation_result.overall_score,
eval_status=evaluation_result.overall_eval_status,
details=eval_metric_result_details,
**eval_metric.model_dump(),
)
)
if (
evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
and len(evaluation_result.per_invocation_results)
!= len(eval_metric_result_per_invocation)
):
raise ValueError(
'Eval metric should return results for each invocation. Found '
f'{len(evaluation_result.per_invocation_results)} results for '
f'{len(eval_metric_result_per_invocation)} invocations.'
)
# Track score across individual invocations.
for idx, invocation in enumerate(eval_metric_result_per_invocation):
invocation_result = (
evaluation_result.per_invocation_results[idx]
if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
else PerInvocationResult(
actual_invocation=invocation.actual_invocation
)
)
eval_metric_result_details = EvalMetricResultDetails(
rubric_scores=invocation_result.rubric_scores
)
invocation.eval_metric_results.append(
EvalMetricResult(
score=invocation_result.score,
eval_status=invocation_result.eval_status,
details=eval_metric_result_details,
**eval_metric.model_dump(),
)
)
final_eval_status = self._generate_final_eval_status(
overall_eval_metric_results
@@ -342,6 +334,84 @@ class LocalEvalService(BaseEvalService):
return (inference_result, eval_case_result)
async def _evaluate_metric_for_eval_case(
    self,
    eval_metric: EvalMetric,
    eval_case: EvalCase,
    inference_result: InferenceResult,
    eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation],
    overall_eval_metric_results: list[EvalMetricResult],
):
  """Evaluates one metric for an eval case and records the outcome.

  Nothing is returned; results are appended to the two list arguments.

  Args:
    eval_metric: The metric to evaluate. Its dumped fields are passed to
      every EvalMetricResult created here.
    eval_case: Supplies the expected conversation, the conversation scenario
      and the eval id used in the error log.
    inference_result: Supplies the actual invocations (`inferences`).
    eval_metric_result_per_invocation: Per-invocation holders; each one's
      `eval_metric_results` gets one result appended for this metric.
    overall_eval_metric_results: Receives one overall result for this metric.

  Raises:
    ValueError: If the metric was evaluated but the number of per-invocation
      results differs from the number of per-invocation holders.
  """
  try:
    with client_label_context(EVAL_CLIENT_LABEL):
      evaluation_result = await self._evaluate_metric(
          eval_metric=eval_metric,
          actual_invocations=inference_result.inferences,
          expected_invocations=eval_case.conversation,
          conversation_scenario=eval_case.conversation_scenario,
      )
  except Exception as e:
    # Deliberately broad: one failing metric must not stop the evaluation of
    # the other metrics, so it is logged and replaced by an empty result.
    logger.error(
        "Metric evaluation failed for metric `%s` for eval case id '%s'"
        ' with following error `%s`',
        eval_metric.metric_name,
        eval_case.eval_id,
        e,
        exc_info=True,
    )
    evaluation_result = EvaluationResult(
        overall_eval_status=EvalStatus.NOT_EVALUATED
    )

  # Overall outcome across all invocations.
  overall_eval_metric_results.append(
      EvalMetricResult(
          score=evaluation_result.overall_score,
          eval_status=evaluation_result.overall_eval_status,
          details=EvalMetricResultDetails(
              rubric_scores=evaluation_result.overall_rubric_scores
          ),
          **eval_metric.model_dump(),
      )
  )

  was_evaluated = (
      evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
  )
  if was_evaluated:
    result_count = len(evaluation_result.per_invocation_results)
    invocation_count = len(eval_metric_result_per_invocation)
    if result_count != invocation_count:
      raise ValueError(
          'Eval metric should return results for each invocation. Found '
          f'{result_count} results for '
          f'{invocation_count} invocations.'
      )

  # Outcome for each individual invocation.
  for idx, per_invocation in enumerate(eval_metric_result_per_invocation):
    if was_evaluated:
      invocation_result = evaluation_result.per_invocation_results[idx]
    else:
      # No real result exists; record a placeholder for this invocation.
      invocation_result = PerInvocationResult(
          actual_invocation=per_invocation.actual_invocation
      )
    per_invocation.eval_metric_results.append(
        EvalMetricResult(
            score=invocation_result.score,
            eval_status=invocation_result.eval_status,
            details=EvalMetricResultDetails(
                rubric_scores=invocation_result.rubric_scores
            ),
            **eval_metric.model_dump(),
        )
    )
async def _evaluate_metric(
self,
eval_metric: EvalMetric,
@@ -328,28 +328,67 @@ class RubricBasedEvaluator(LlmAsJudge):
assert self._criterion.rubrics, "Rubrics are required."
self._rubrics: list[Rubric] = self._criterion.rubrics
self._effective_rubrics_list: Optional[list[Rubric]] = None
self._normalized_rubric_to_id_map = {
_normalize_text(r.rubric_content.text_property): r.rubric_id
for r in self._rubrics
}
def create_effective_rubrics_list(
    self,
    invocation_rubrics: Optional[list[Rubric]],
) -> None:
  """Builds and stores the rubrics that apply to one invocation.

  The stored list holds the criterion rubrics (`self._rubrics`) followed by
  the invocation rubrics, and replaces any list stored by an earlier call.

  Args:
    invocation_rubrics: Extra rubrics for the invocation; None or an empty
      list means only the criterion rubrics are used.

  Raises:
    ValueError: If two rubrics share a rubric_id. The message names the scope
      ("criterion" or "invocation") of the rubric that caused the clash.
  """
  scoped_rubrics = [("criterion", self._rubrics)]
  if invocation_rubrics:
    scoped_rubrics.append(("invocation", invocation_rubrics))

  merged = {}
  for scope_name, rubrics in scoped_rubrics:
    for rubric in rubrics:
      rubric_id = rubric.rubric_id
      if rubric_id in merged:
        raise ValueError(
            f"Rubric with rubric_id '{rubric_id}' already exists. Rubric"
            f" defined in {scope_name} conflicts with an existing rubric."
        )
      merged[rubric_id] = rubric

  self._effective_rubrics_list = list(merged.values())
def get_effective_rubrics_list(self) -> list[Rubric]:
  """Returns the stored effective rubrics list.

  Raises:
    ValueError: If no list has been stored yet, i.e. the stored value is
      still None.
  """
  effective_rubrics = self._effective_rubrics_list
  if effective_rubrics is not None:
    return effective_rubrics
  raise ValueError(
      "Effective rubrics list not initialized. Call"
      " create_effective_rubrics_list() first."
  )
@override
def convert_auto_rater_response_to_score(
self, auto_rater_response: LlmResponse
self,
auto_rater_response: LlmResponse,
) -> AutoRaterScore:
"""Returns an AutoRaterScore generated from AutoRater's response."""
response_text = get_text_from_content(auto_rater_response.content)
rubric_responses = self._auto_rater_response_parser.parse(response_text)
rubric_scores = []
normalized_rubric_to_rubric_map = {}
for r in self.get_effective_rubrics_list():
normalized_rubric_to_rubric_map[
_normalize_text(r.rubric_content.text_property)
] = r
for rubric_response in rubric_responses:
normalized_rubric = _normalize_text(rubric_response.property_text)
rubric_id = self._normalized_rubric_to_id_map.get(normalized_rubric, None)
if rubric_id:
normalized_rubric_text = _normalize_text(rubric_response.property_text)
rubric = normalized_rubric_to_rubric_map.get(normalized_rubric_text, None)
if rubric:
rubric_scores.append(
RubricScore(
rubric_id=rubric_id,
rubric_id=rubric.rubric_id,
rationale=rubric_response.rationale,
score=rubric_response.score,
)
@@ -25,6 +25,7 @@ from .eval_case import Invocation
from .eval_case import InvocationEvents
from .eval_metrics import EvalMetric
from .eval_metrics import RubricsBasedCriterion
from .eval_rubrics import Rubric
from .llm_as_judge_utils import get_text_from_content
from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
from .llm_as_judge_utils import get_tool_declarations_as_json_str
@@ -264,15 +265,19 @@ class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator):
@override
def format_auto_rater_prompt(
self, actual_invocation: Invocation, _: Optional[Invocation]
self,
actual_invocation: Invocation,
_: Optional[Invocation],
) -> str:
"""Returns the autorater prompt."""
self.create_effective_rubrics_list(actual_invocation.rubrics)
user_input = get_text_from_content(actual_invocation.user_content)
final_response = get_text_from_content(actual_invocation.final_response)
rubrics = "\n* ".join(
[r.rubric_content.text_property for r in self._rubrics]
)
rubrics_text = "\n".join([
f"* {r.rubric_content.text_property}"
for r in self._effective_rubrics_list
])
developer_instructions = ""
tool_declarations = "Agent has no tools."
@@ -299,7 +304,7 @@ class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator):
user_input=user_input,
response_steps=response_steps,
final_response=final_response,
rubrics=rubrics,
rubrics=rubrics_text,
)
return auto_rater_prompt
@@ -164,17 +164,21 @@ class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator):
@override
def format_auto_rater_prompt(
self, actual_invocation: Invocation, _: Optional[Invocation]
self,
actual_invocation: Invocation,
_: Optional[Invocation],
) -> str:
"""Returns the autorater prompt."""
self.create_effective_rubrics_list(actual_invocation.rubrics)
user_input = get_text_from_content(actual_invocation.user_content)
tool_usage = get_tool_calls_and_responses_as_json_str(
actual_invocation.intermediate_data
)
rubrics = "\n* ".join(
[r.rubric_content.text_property for r in self._rubrics]
)
rubrics_text = "\n".join([
f"* {r.rubric_content.text_property}"
for r in self._effective_rubrics_list
])
app_details = actual_invocation.app_details
tool_declarations = "Agent has no tools."
@@ -185,5 +189,5 @@ class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator):
tool_declarations=tool_declarations,
user_input=user_input,
tool_usage=tool_usage,
rubrics=rubrics,
rubrics=rubrics_text,
)
@@ -20,6 +20,7 @@ from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import JudgeModelOptions
from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
from google.adk.evaluation.eval_rubrics import Rubric
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import PerInvocationResult
@@ -35,12 +36,17 @@ import pytest
class MockLlmAsJudge(LlmAsJudge):
def format_auto_rater_prompt(
self, actual_invocation: Invocation, expected_invocation: Invocation
self,
actual_invocation: Invocation,
expected_invocation: Optional[Invocation],
rubrics: Optional[list[Rubric]] = None,
) -> str:
return "formatted prompt"
def convert_auto_rater_response_to_score(
self, llm_response: LlmResponse
self,
llm_response: LlmResponse,
rubrics: Optional[list[Rubric]] = None,
) -> AutoRaterScore:
return AutoRaterScore(score=1.0)
@@ -34,6 +34,8 @@ from google.adk.evaluation.eval_metrics import Interval
from google.adk.evaluation.eval_metrics import MetricInfo
from google.adk.evaluation.eval_metrics import MetricValueInfo
from google.adk.evaluation.eval_result import EvalCaseResult
from google.adk.evaluation.eval_rubrics import Rubric
from google.adk.evaluation.eval_rubrics import RubricContent
from google.adk.evaluation.eval_set import EvalCase
from google.adk.evaluation.eval_set import EvalSet
from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager
@@ -42,6 +44,9 @@ from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import Evaluator
from google.adk.evaluation.evaluator import PerInvocationResult
from google.adk.evaluation.local_eval_service import _add_rubrics_to_invocation
from google.adk.evaluation.local_eval_service import _copy_eval_case_rubrics_to_actual_invocations
from google.adk.evaluation.local_eval_service import _copy_invocation_rubrics_to_actual_invocations
from google.adk.evaluation.local_eval_service import LocalEvalService
from google.adk.evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
from google.adk.models.registry import LLMRegistry
@@ -678,3 +683,111 @@ async def test_mcp_stdio_agent_no_runtime_error(mocker):
import shutil
shutil.rmtree(test_dir, ignore_errors=True)
def test_add_rubrics_to_invocation_initializes_rubrics_list():
  """Adding to an invocation built without rubrics yields just that rubric."""
  new_rubric = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p1")
  )
  target = Invocation(user_content=genai_types.Content())

  _add_rubrics_to_invocation(target, [new_rubric])

  assert target.rubrics == [new_rubric]
def test_add_rubrics_to_invocation_adds_to_existing_list():
  """New rubrics are appended after those the invocation already holds."""
  existing_rubric, added_rubric = (
      Rubric(
          rubric_id=rubric_id,
          rubric_content=RubricContent(text_property=text),
      )
      for rubric_id, text in (("r1", "p1"), ("r2", "p2"))
  )
  target = Invocation(
      user_content=genai_types.Content(), rubrics=[existing_rubric]
  )

  _add_rubrics_to_invocation(target, [added_rubric])

  assert target.rubrics == [existing_rubric, added_rubric]
def test_add_rubrics_to_invocation_errors_on_duplicate_id():
  """A rubric whose id is already on the invocation is rejected.

  The error message is matched so that an unrelated ValueError cannot make
  the test pass, and the invocation's rubrics are checked to be unchanged.
  """
  rubric1 = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p1")
  )
  rubric2 = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p2")
  )
  invocation = Invocation(user_content=genai_types.Content(), rubrics=[rubric1])

  with pytest.raises(ValueError, match="rubric_id 'r1' already exists"):
    _add_rubrics_to_invocation(invocation, [rubric2])

  # The rejected rubric must not have been attached.
  assert invocation.rubrics == [rubric1]
def test_copy_eval_case_rubrics_to_actual_invocations():
  """Every actual invocation ends up with the eval case's rubric."""

  def _invocation(text):
    # Invocation whose user content is a single text part.
    return Invocation(
        user_content=genai_types.Content(parts=[genai_types.Part(text=text)])
    )

  case_rubric = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p1")
  )
  eval_case = EvalCase(
      eval_id="case1",
      conversation=[
          _invocation("expected invocation 1."),
          _invocation("expected invocation 2."),
      ],
      rubrics=[case_rubric],
  )
  actual_invocations = [
      _invocation("actual invocation 1."),
      _invocation("actual invocation 2."),
  ]

  _copy_eval_case_rubrics_to_actual_invocations(eval_case, actual_invocations)

  first, second = actual_invocations
  assert first.rubrics == [case_rubric]
  assert second.rubrics == [case_rubric]
def test_copy_invocation_rubrics_to_actual_invocations():
  """Each actual invocation receives the rubrics of its expected peer."""

  def _invocation(text, **extra_fields):
    # Invocation whose user content is a single text part.
    return Invocation(
        user_content=genai_types.Content(parts=[genai_types.Part(text=text)]),
        **extra_fields,
    )

  first_rubric = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p1")
  )
  second_rubric = Rubric(
      rubric_id="r2", rubric_content=RubricContent(text_property="p2")
  )
  expected_invocations = [
      _invocation("expected invocation 1.", rubrics=[first_rubric]),
      _invocation("expected invocation 2.", rubrics=[second_rubric]),
  ]
  actual_invocations = [
      _invocation("actual invocation 1."),
      _invocation("actual invocation 2."),
  ]

  _copy_invocation_rubrics_to_actual_invocations(
      expected_invocations, actual_invocations
  )

  first_actual, second_actual = actual_invocations
  assert first_actual.rubrics == [first_rubric]
  assert second_actual.rubrics == [second_rubric]
@@ -465,6 +465,7 @@ class TestRubricBasedEvaluator:
evaluator: RubricBasedEvaluator,
):
"""Tests convert_auto_rater_response_to_score with an empty response."""
evaluator.create_effective_rubrics_list(None)
response = LlmResponse(
content=genai_types.Content(parts=[genai_types.Part(text="")])
)
@@ -477,6 +478,7 @@ class TestRubricBasedEvaluator:
evaluator: RubricBasedEvaluator,
):
"""Tests convert_auto_rater_response_to_score with a malformed response."""
evaluator.create_effective_rubrics_list(None)
response = LlmResponse(
content=genai_types.Content(
parts=[genai_types.Part(text="This is not a valid format.")]
@@ -491,6 +493,7 @@ class TestRubricBasedEvaluator:
evaluator: RubricBasedEvaluator,
):
"""Tests convert_auto_rater_response_to_score with mixed verdicts."""
evaluator.create_effective_rubrics_list(None)
response_text = """
Property: Is the response good?
Rationale: It was good.
@@ -515,6 +518,7 @@ class TestRubricBasedEvaluator:
evaluator: RubricBasedEvaluator,
):
"""Tests convert_auto_rater_response_to_score with an invalid verdict."""
evaluator.create_effective_rubrics_list(None)
response_text = """
Property: Is the response good?
Rationale: It was good.
@@ -539,6 +543,7 @@ class TestRubricBasedEvaluator:
evaluator: RubricBasedEvaluator,
):
"""Tests convert_auto_rater_response_to_score with an unknown property."""
evaluator.create_effective_rubrics_list(None)
response_text = """
Property: Is the response amazing?
Rationale: It was amazing.
@@ -551,4 +556,71 @@ class TestRubricBasedEvaluator:
)
auto_rater_score = evaluator.convert_auto_rater_response_to_score(response)
assert auto_rater_score.score is None
assert len(auto_rater_score.rubric_scores) == 0
assert not auto_rater_score.rubric_scores
def test_create_effective_rubrics_list_with_invocation_rubrics(
    self, evaluator: RubricBasedEvaluator
):
  """An invocation rubric with id "3" joins the evaluator's "1" and "2"."""
  extra_rubric = Rubric(
      rubric_id="3",
      rubric_content=RubricContent(text_property="Invocation rubric"),
  )

  evaluator.create_effective_rubrics_list([extra_rubric])

  effective_ids = [
      rubric.rubric_id for rubric in evaluator.get_effective_rubrics_list()
  ]
  assert len(effective_ids) == 3
  assert set(effective_ids) == {"1", "2", "3"}
def test_create_effective_rubrics_list_with_duplicate_invocation_rubric_id(
    self, evaluator: RubricBasedEvaluator
):
  """An invocation rubric with id "1" makes the call raise ValueError."""
  clashing_rubric = Rubric(
      rubric_id="1",
      rubric_content=RubricContent(text_property="Invocation rubric"),
  )

  with pytest.raises(ValueError):
    evaluator.create_effective_rubrics_list([clashing_rubric])
def test_create_effective_rubrics_list_with_no_invocation_rubrics(
    self, evaluator: RubricBasedEvaluator
):
  """Passing None leaves only the rubrics with ids "1" and "2"."""
  evaluator.create_effective_rubrics_list(None)

  effective_ids = [
      rubric.rubric_id for rubric in evaluator.get_effective_rubrics_list()
  ]
  assert len(effective_ids) == 2
  assert set(effective_ids) == {"1", "2"}
def test_get_effective_rubrics_list_before_creation_raises_error(
    self, evaluator: RubricBasedEvaluator
):
  """Reading the list before it has been created raises ValueError."""
  expected_message = "Effective rubrics list not initialized."
  with pytest.raises(ValueError, match=expected_message):
    evaluator.get_effective_rubrics_list()
def test_create_effective_rubrics_list_multiple_calls(
    self, evaluator: RubricBasedEvaluator
):
  """A second call replaces the first call's invocation rubrics."""

  def _effective_ids_after(rubric_id, text):
    # Rebuild the effective list with one invocation rubric; return its ids.
    evaluator.create_effective_rubrics_list([
        Rubric(
            rubric_id=rubric_id,
            rubric_content=RubricContent(text_property=text),
        )
    ])
    return [r.rubric_id for r in evaluator.get_effective_rubrics_list()]

  first_ids = _effective_ids_after("3", "Invocation rubric 1")
  assert len(first_ids) == 3
  assert set(first_ids) == {"1", "2", "3"}

  second_ids = _effective_ids_after("4", "Invocation rubric 2")
  assert len(second_ids) == 3
  assert set(second_ids) == {"1", "2", "4"}