You've already forked adk-python
mirror of
https://github.com/encounter/adk-python.git
synced 2026-03-30 10:57:20 -07:00
feat: Support per-eval case and per-invocation rubrics in rubric-based evaluators
Co-authored-by: Joseph Pagadora <jcpagadora@google.com> PiperOrigin-RevId: 853820099
This commit is contained in:
committed by
Copybara-Service
parent
688791396a
commit
8afb99a078
@@ -201,9 +201,11 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
|
||||
for r in metric_result.criterion.rubrics
|
||||
}
|
||||
for rubric_score in metric_result.details.rubric_scores:
|
||||
rubric = rubrics_by_id.get(rubric_score.rubric_id)
|
||||
rubric_text = rubrics_by_id.get(rubric_score.rubric_id)
|
||||
if not rubric_text:
|
||||
rubric_text = rubric_score.rubric_id
|
||||
click.echo(
|
||||
f"Rubric: {rubric}, "
|
||||
f"Rubric: {rubric_text}, "
|
||||
f"Score: {rubric_score.score}, "
|
||||
f"Reasoning: {rubric_score.rationale}"
|
||||
)
|
||||
@@ -243,6 +245,8 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
|
||||
}
|
||||
for rubric_score in metric_result.details.rubric_scores:
|
||||
rubric = rubrics_by_id.get(rubric_score.rubric_id)
|
||||
if not rubric:
|
||||
rubric = rubric_score.rubric_id
|
||||
row_data[f"Rubric: {rubric}"] = (
|
||||
f"Reasoning: {rubric_score.rationale}, "
|
||||
f"Score: {rubric_score.score}"
|
||||
|
||||
@@ -46,6 +46,7 @@ from .eval_metrics import EvalMetric
|
||||
from .eval_metrics import EvalMetricResult
|
||||
from .eval_metrics import EvalMetricResultDetails
|
||||
from .eval_metrics import EvalMetricResultPerInvocation
|
||||
from .eval_metrics import Rubric
|
||||
from .eval_result import EvalCaseResult
|
||||
from .eval_set import EvalCase
|
||||
from .eval_set_results_manager import EvalSetResultsManager
|
||||
@@ -67,6 +68,46 @@ def _get_session_id() -> str:
|
||||
return f'{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}'
|
||||
|
||||
|
||||
def _add_rubrics_to_invocation(
|
||||
invocation: Invocation, rubrics_to_add: list[Rubric]
|
||||
):
|
||||
"""Adds rubrics to invocation, throwing ValueError on duplicate rubric_id."""
|
||||
if not invocation.rubrics:
|
||||
invocation.rubrics = []
|
||||
existing_ids = {r.rubric_id for r in invocation.rubrics}
|
||||
for rubric in rubrics_to_add:
|
||||
if rubric.rubric_id in existing_ids:
|
||||
raise ValueError(
|
||||
f"Rubric with rubric_id '{rubric.rubric_id}' already exists."
|
||||
)
|
||||
invocation.rubrics.append(rubric)
|
||||
existing_ids.add(rubric.rubric_id)
|
||||
|
||||
|
||||
def _copy_eval_case_rubrics_to_actual_invocations(
    eval_case: EvalCase, actual_invocations: list[Invocation]
):
  """Attaches the eval case's own rubrics to every actual invocation.

  Does nothing when the eval case has no `rubrics` attribute or the attribute
  is empty. Duplicate rubric ids are reported by
  `_add_rubrics_to_invocation`.
  """
  case_level_rubrics = getattr(eval_case, 'rubrics', None)
  if not case_level_rubrics:
    return

  for actual in actual_invocations:
    _add_rubrics_to_invocation(actual, case_level_rubrics)
|
||||
|
||||
|
||||
def _copy_invocation_rubrics_to_actual_invocations(
    expected_invocations: Optional[list[Invocation]],
    actual_invocations: list[Invocation],
):
  """Attaches each expected invocation's rubrics to its actual counterpart.

  Invocations are paired by position; pairing stops at the end of the shorter
  list. Expected invocations without rubrics are skipped, and nothing happens
  when `expected_invocations` is None or empty.
  """
  if not expected_invocations:
    return

  for actual, expected in zip(actual_invocations, expected_invocations):
    if not expected.rubrics:
      continue
    _add_rubrics_to_invocation(actual, expected.rubrics)
|
||||
|
||||
|
||||
@experimental
|
||||
class LocalEvalService(BaseEvalService):
|
||||
"""An implementation of BaseEvalService, that runs the evals locally."""
|
||||
@@ -249,76 +290,27 @@ class LocalEvalService(BaseEvalService):
|
||||
)
|
||||
)
|
||||
|
||||
actual_invocations = inference_result.inferences
|
||||
expected_invocations = eval_case.conversation
|
||||
|
||||
# 1. Copy EvalCase level rubrics to all actual invocations.
|
||||
_copy_eval_case_rubrics_to_actual_invocations(eval_case, actual_invocations)
|
||||
|
||||
# 2. If expected invocations are present, copy invocation level
|
||||
# rubrics to corresponding actual invocations.
|
||||
_copy_invocation_rubrics_to_actual_invocations(
|
||||
expected_invocations, actual_invocations
|
||||
)
|
||||
|
||||
for eval_metric in evaluate_config.eval_metrics:
|
||||
# Perform evaluation of the metric.
|
||||
try:
|
||||
with client_label_context(EVAL_CLIENT_LABEL):
|
||||
evaluation_result = await self._evaluate_metric(
|
||||
eval_metric=eval_metric,
|
||||
actual_invocations=inference_result.inferences,
|
||||
expected_invocations=eval_case.conversation,
|
||||
conversation_scenario=eval_case.conversation_scenario,
|
||||
)
|
||||
except Exception as e:
|
||||
# We intentionally catch the Exception as we don't want failures to
|
||||
# affect other metric evaluation.
|
||||
logger.error(
|
||||
"Metric evaluation failed for metric `%s` for eval case id '%s'"
|
||||
' with following error `%s`',
|
||||
eval_metric.metric_name,
|
||||
eval_case.eval_id,
|
||||
e,
|
||||
exc_info=True,
|
||||
)
|
||||
# We use an empty result.
|
||||
evaluation_result = EvaluationResult(
|
||||
overall_eval_status=EvalStatus.NOT_EVALUATED
|
||||
)
|
||||
|
||||
# Track overall score across all invocations.
|
||||
eval_metric_result_details = EvalMetricResultDetails(
|
||||
rubric_scores=evaluation_result.overall_rubric_scores
|
||||
await self._evaluate_metric_for_eval_case(
|
||||
eval_metric,
|
||||
eval_case,
|
||||
inference_result,
|
||||
eval_metric_result_per_invocation,
|
||||
overall_eval_metric_results,
|
||||
)
|
||||
overall_eval_metric_results.append(
|
||||
EvalMetricResult(
|
||||
score=evaluation_result.overall_score,
|
||||
eval_status=evaluation_result.overall_eval_status,
|
||||
details=eval_metric_result_details,
|
||||
**eval_metric.model_dump(),
|
||||
)
|
||||
)
|
||||
|
||||
if (
|
||||
evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
|
||||
and len(evaluation_result.per_invocation_results)
|
||||
!= len(eval_metric_result_per_invocation)
|
||||
):
|
||||
raise ValueError(
|
||||
'Eval metric should return results for each invocation. Found '
|
||||
f'{len(evaluation_result.per_invocation_results)} results for '
|
||||
f'{len(eval_metric_result_per_invocation)} invocations.'
|
||||
)
|
||||
|
||||
# Track score across individual invocations.
|
||||
for idx, invocation in enumerate(eval_metric_result_per_invocation):
|
||||
invocation_result = (
|
||||
evaluation_result.per_invocation_results[idx]
|
||||
if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
|
||||
else PerInvocationResult(
|
||||
actual_invocation=invocation.actual_invocation
|
||||
)
|
||||
)
|
||||
eval_metric_result_details = EvalMetricResultDetails(
|
||||
rubric_scores=invocation_result.rubric_scores
|
||||
)
|
||||
invocation.eval_metric_results.append(
|
||||
EvalMetricResult(
|
||||
score=invocation_result.score,
|
||||
eval_status=invocation_result.eval_status,
|
||||
details=eval_metric_result_details,
|
||||
**eval_metric.model_dump(),
|
||||
)
|
||||
)
|
||||
|
||||
final_eval_status = self._generate_final_eval_status(
|
||||
overall_eval_metric_results
|
||||
@@ -342,6 +334,84 @@ class LocalEvalService(BaseEvalService):
|
||||
|
||||
return (inference_result, eval_case_result)
|
||||
|
||||
async def _evaluate_metric_for_eval_case(
    self,
    eval_metric: EvalMetric,
    eval_case: EvalCase,
    inference_result: InferenceResult,
    eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation],
    overall_eval_metric_results: list[EvalMetricResult],
):
  """Evaluates one metric for an eval case and records the outcome.

  The metric is evaluated through `self._evaluate_metric`. If that raises, the
  error is logged and a result with status `NOT_EVALUATED` is used instead, so
  one failing metric does not stop the others.

  Args:
    eval_metric: The metric to evaluate.
    eval_case: Supplies the expected conversation, the conversation scenario
      and the eval id used in the log message.
    inference_result: Supplies the actual invocations (`inferences`).
    eval_metric_result_per_invocation: One entry per invocation; the metric's
      per-invocation result is appended to each entry's
      `eval_metric_results`.
    overall_eval_metric_results: The metric's overall result is appended to
      this list.

  Raises:
    ValueError: If the metric was evaluated but returned a different number of
      per-invocation results than there are invocations. The overall result
      has already been appended by then.
  """
  try:
    with client_label_context(EVAL_CLIENT_LABEL):
      evaluation_result = await self._evaluate_metric(
          eval_metric=eval_metric,
          actual_invocations=inference_result.inferences,
          expected_invocations=eval_case.conversation,
          conversation_scenario=eval_case.conversation_scenario,
      )
  except Exception as e:
    # Deliberately broad: a failure in one metric must not prevent the
    # remaining metrics from being evaluated.
    logger.error(
        "Metric evaluation failed for metric `%s` for eval case id '%s'"
        ' with following error `%s`',
        eval_metric.metric_name,
        eval_case.eval_id,
        e,
        exc_info=True,
    )
    # Fall back to an empty, not-evaluated result.
    evaluation_result = EvaluationResult(
        overall_eval_status=EvalStatus.NOT_EVALUATED
    )

  # Record the overall outcome across all invocations.
  overall_details = EvalMetricResultDetails(
      rubric_scores=evaluation_result.overall_rubric_scores
  )
  overall_result = EvalMetricResult(
      score=evaluation_result.overall_score,
      eval_status=evaluation_result.overall_eval_status,
      details=overall_details,
      **eval_metric.model_dump(),
  )
  overall_eval_metric_results.append(overall_result)

  was_evaluated = (
      evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
  )

  if was_evaluated:
    num_results = len(evaluation_result.per_invocation_results)
    num_invocations = len(eval_metric_result_per_invocation)
    if num_results != num_invocations:
      raise ValueError(
          'Eval metric should return results for each invocation. Found '
          f'{num_results} results for '
          f'{num_invocations} invocations.'
      )

  # Record the outcome for each individual invocation.
  for idx, per_invocation_entry in enumerate(
      eval_metric_result_per_invocation
  ):
    if was_evaluated:
      invocation_result = evaluation_result.per_invocation_results[idx]
    else:
      # No real result exists; use a placeholder carrying only the invocation.
      invocation_result = PerInvocationResult(
          actual_invocation=per_invocation_entry.actual_invocation
      )

    invocation_details = EvalMetricResultDetails(
        rubric_scores=invocation_result.rubric_scores
    )
    per_invocation_entry.eval_metric_results.append(
        EvalMetricResult(
            score=invocation_result.score,
            eval_status=invocation_result.eval_status,
            details=invocation_details,
            **eval_metric.model_dump(),
        )
    )
|
||||
|
||||
async def _evaluate_metric(
|
||||
self,
|
||||
eval_metric: EvalMetric,
|
||||
|
||||
@@ -328,28 +328,67 @@ class RubricBasedEvaluator(LlmAsJudge):
|
||||
assert self._criterion.rubrics, "Rubrics are required."
|
||||
|
||||
self._rubrics: list[Rubric] = self._criterion.rubrics
|
||||
self._effective_rubrics_list: Optional[list[Rubric]] = None
|
||||
|
||||
self._normalized_rubric_to_id_map = {
|
||||
_normalize_text(r.rubric_content.text_property): r.rubric_id
|
||||
for r in self._rubrics
|
||||
}
|
||||
|
||||
def create_effective_rubrics_list(
    self,
    invocation_rubrics: Optional[list[Rubric]],
) -> None:
  """Builds the rubric list to use for the current invocation.

  The list holds the criterion rubrics (`self._rubrics`) followed by the given
  invocation rubrics, and is stored on `self._effective_rubrics_list`.

  Args:
    invocation_rubrics: Rubrics attached to the invocation being judged. None
      or an empty list means only the criterion rubrics are used.

  Raises:
    ValueError: If two rubrics share a `rubric_id`. The stored list is left
      unset (None) in that case.
  """
  # Clear the list from any earlier invocation first, so that a failure below
  # cannot leave stale rubrics behind for later reads.
  self._effective_rubrics_list = None

  rubrics_by_id = {}
  scoped_rubrics = (
      ("criterion", self._rubrics),
      ("invocation", invocation_rubrics or []),
  )
  for scope_name, rubrics in scoped_rubrics:
    for rubric in rubrics:
      if rubric.rubric_id in rubrics_by_id:
        raise ValueError(
            f"Rubric with rubric_id '{rubric.rubric_id}' already exists. Rubric"
            f" defined in {scope_name} conflicts with an existing rubric."
        )
      rubrics_by_id[rubric.rubric_id] = rubric

  self._effective_rubrics_list = list(rubrics_by_id.values())
|
||||
|
||||
def get_effective_rubrics_list(self) -> list[Rubric]:
  """Returns the rubric list built by `create_effective_rubrics_list`.

  Raises:
    ValueError: If no effective rubrics list has been stored yet.
  """
  effective_rubrics = self._effective_rubrics_list
  if effective_rubrics is not None:
    return effective_rubrics

  raise ValueError(
      "Effective rubrics list not initialized. Call"
      " create_effective_rubrics_list() first."
  )
|
||||
|
||||
@override
|
||||
def convert_auto_rater_response_to_score(
|
||||
self, auto_rater_response: LlmResponse
|
||||
self,
|
||||
auto_rater_response: LlmResponse,
|
||||
) -> AutoRaterScore:
|
||||
"""Returns an AutoRaterScore generated from AutoRater's response."""
|
||||
response_text = get_text_from_content(auto_rater_response.content)
|
||||
rubric_responses = self._auto_rater_response_parser.parse(response_text)
|
||||
rubric_scores = []
|
||||
|
||||
normalized_rubric_to_rubric_map = {}
|
||||
for r in self.get_effective_rubrics_list():
|
||||
normalized_rubric_to_rubric_map[
|
||||
_normalize_text(r.rubric_content.text_property)
|
||||
] = r
|
||||
|
||||
for rubric_response in rubric_responses:
|
||||
normalized_rubric = _normalize_text(rubric_response.property_text)
|
||||
rubric_id = self._normalized_rubric_to_id_map.get(normalized_rubric, None)
|
||||
if rubric_id:
|
||||
normalized_rubric_text = _normalize_text(rubric_response.property_text)
|
||||
rubric = normalized_rubric_to_rubric_map.get(normalized_rubric_text, None)
|
||||
if rubric:
|
||||
rubric_scores.append(
|
||||
RubricScore(
|
||||
rubric_id=rubric_id,
|
||||
rubric_id=rubric.rubric_id,
|
||||
rationale=rubric_response.rationale,
|
||||
score=rubric_response.score,
|
||||
)
|
||||
|
||||
@@ -25,6 +25,7 @@ from .eval_case import Invocation
|
||||
from .eval_case import InvocationEvents
|
||||
from .eval_metrics import EvalMetric
|
||||
from .eval_metrics import RubricsBasedCriterion
|
||||
from .eval_rubrics import Rubric
|
||||
from .llm_as_judge_utils import get_text_from_content
|
||||
from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
|
||||
from .llm_as_judge_utils import get_tool_declarations_as_json_str
|
||||
@@ -264,15 +265,19 @@ class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator):
|
||||
|
||||
@override
|
||||
def format_auto_rater_prompt(
|
||||
self, actual_invocation: Invocation, _: Optional[Invocation]
|
||||
self,
|
||||
actual_invocation: Invocation,
|
||||
_: Optional[Invocation],
|
||||
) -> str:
|
||||
"""Returns the autorater prompt."""
|
||||
|
||||
self.create_effective_rubrics_list(actual_invocation.rubrics)
|
||||
user_input = get_text_from_content(actual_invocation.user_content)
|
||||
final_response = get_text_from_content(actual_invocation.final_response)
|
||||
rubrics = "\n* ".join(
|
||||
[r.rubric_content.text_property for r in self._rubrics]
|
||||
)
|
||||
|
||||
rubrics_text = "\n".join([
|
||||
f"* {r.rubric_content.text_property}"
|
||||
for r in self._effective_rubrics_list
|
||||
])
|
||||
|
||||
developer_instructions = ""
|
||||
tool_declarations = "Agent has no tools."
|
||||
@@ -299,7 +304,7 @@ class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator):
|
||||
user_input=user_input,
|
||||
response_steps=response_steps,
|
||||
final_response=final_response,
|
||||
rubrics=rubrics,
|
||||
rubrics=rubrics_text,
|
||||
)
|
||||
|
||||
return auto_rater_prompt
|
||||
|
||||
@@ -164,17 +164,21 @@ class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator):
|
||||
|
||||
@override
|
||||
def format_auto_rater_prompt(
|
||||
self, actual_invocation: Invocation, _: Optional[Invocation]
|
||||
self,
|
||||
actual_invocation: Invocation,
|
||||
_: Optional[Invocation],
|
||||
) -> str:
|
||||
"""Returns the autorater prompt."""
|
||||
|
||||
self.create_effective_rubrics_list(actual_invocation.rubrics)
|
||||
user_input = get_text_from_content(actual_invocation.user_content)
|
||||
tool_usage = get_tool_calls_and_responses_as_json_str(
|
||||
actual_invocation.intermediate_data
|
||||
)
|
||||
rubrics = "\n* ".join(
|
||||
[r.rubric_content.text_property for r in self._rubrics]
|
||||
)
|
||||
|
||||
rubrics_text = "\n".join([
|
||||
f"* {r.rubric_content.text_property}"
|
||||
for r in self._effective_rubrics_list
|
||||
])
|
||||
|
||||
app_details = actual_invocation.app_details
|
||||
tool_declarations = "Agent has no tools."
|
||||
@@ -185,5 +189,5 @@ class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator):
|
||||
tool_declarations=tool_declarations,
|
||||
user_input=user_input,
|
||||
tool_usage=tool_usage,
|
||||
rubrics=rubrics,
|
||||
rubrics=rubrics_text,
|
||||
)
|
||||
|
||||
@@ -20,6 +20,7 @@ from google.adk.evaluation.eval_case import Invocation
|
||||
from google.adk.evaluation.eval_metrics import EvalMetric
|
||||
from google.adk.evaluation.eval_metrics import JudgeModelOptions
|
||||
from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion
|
||||
from google.adk.evaluation.eval_rubrics import Rubric
|
||||
from google.adk.evaluation.evaluator import EvalStatus
|
||||
from google.adk.evaluation.evaluator import EvaluationResult
|
||||
from google.adk.evaluation.evaluator import PerInvocationResult
|
||||
@@ -35,12 +36,17 @@ import pytest
|
||||
class MockLlmAsJudge(LlmAsJudge):
|
||||
|
||||
def format_auto_rater_prompt(
|
||||
self, actual_invocation: Invocation, expected_invocation: Invocation
|
||||
self,
|
||||
actual_invocation: Invocation,
|
||||
expected_invocation: Optional[Invocation],
|
||||
rubrics: Optional[list[Rubric]] = None,
|
||||
) -> str:
|
||||
return "formatted prompt"
|
||||
|
||||
def convert_auto_rater_response_to_score(
|
||||
self, llm_response: LlmResponse
|
||||
self,
|
||||
llm_response: LlmResponse,
|
||||
rubrics: Optional[list[Rubric]] = None,
|
||||
) -> AutoRaterScore:
|
||||
return AutoRaterScore(score=1.0)
|
||||
|
||||
|
||||
@@ -34,6 +34,8 @@ from google.adk.evaluation.eval_metrics import Interval
|
||||
from google.adk.evaluation.eval_metrics import MetricInfo
|
||||
from google.adk.evaluation.eval_metrics import MetricValueInfo
|
||||
from google.adk.evaluation.eval_result import EvalCaseResult
|
||||
from google.adk.evaluation.eval_rubrics import Rubric
|
||||
from google.adk.evaluation.eval_rubrics import RubricContent
|
||||
from google.adk.evaluation.eval_set import EvalCase
|
||||
from google.adk.evaluation.eval_set import EvalSet
|
||||
from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager
|
||||
@@ -42,6 +44,9 @@ from google.adk.evaluation.evaluator import EvalStatus
|
||||
from google.adk.evaluation.evaluator import EvaluationResult
|
||||
from google.adk.evaluation.evaluator import Evaluator
|
||||
from google.adk.evaluation.evaluator import PerInvocationResult
|
||||
from google.adk.evaluation.local_eval_service import _add_rubrics_to_invocation
|
||||
from google.adk.evaluation.local_eval_service import _copy_eval_case_rubrics_to_actual_invocations
|
||||
from google.adk.evaluation.local_eval_service import _copy_invocation_rubrics_to_actual_invocations
|
||||
from google.adk.evaluation.local_eval_service import LocalEvalService
|
||||
from google.adk.evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
|
||||
from google.adk.models.registry import LLMRegistry
|
||||
@@ -678,3 +683,111 @@ async def test_mcp_stdio_agent_no_runtime_error(mocker):
|
||||
import shutil
|
||||
|
||||
shutil.rmtree(test_dir, ignore_errors=True)
|
||||
|
||||
|
||||
def test_add_rubrics_to_invocation_initializes_rubrics_list():
  """Adding to an invocation built without rubrics yields just that rubric."""
  new_rubric = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p1")
  )
  target = Invocation(user_content=genai_types.Content())

  _add_rubrics_to_invocation(target, [new_rubric])

  assert target.rubrics == [new_rubric]
|
||||
|
||||
|
||||
def test_add_rubrics_to_invocation_adds_to_existing_list():
  """A new rubric is appended after the rubrics the invocation already has."""
  already_present = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p1")
  )
  newcomer = Rubric(
      rubric_id="r2", rubric_content=RubricContent(text_property="p2")
  )
  target = Invocation(
      user_content=genai_types.Content(), rubrics=[already_present]
  )

  _add_rubrics_to_invocation(target, [newcomer])

  assert target.rubrics == [already_present, newcomer]
|
||||
|
||||
|
||||
def test_add_rubrics_to_invocation_errors_on_duplicate_id():
  """Adding a rubric whose id is already on the invocation raises ValueError."""
  already_present = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p1")
  )
  # Same id as above, different content.
  same_id_rubric = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p2")
  )
  target = Invocation(
      user_content=genai_types.Content(), rubrics=[already_present]
  )

  with pytest.raises(ValueError):
    _add_rubrics_to_invocation(target, [same_id_rubric])
|
||||
|
||||
|
||||
def test_copy_eval_case_rubrics_to_actual_invocations():
  """Eval-case level rubrics end up on every actual invocation."""

  def _invocation_with_text(text):
    # Builds an invocation whose user content is a single text part.
    return Invocation(
        user_content=genai_types.Content(parts=[genai_types.Part(text=text)])
    )

  case_rubric = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p1")
  )
  eval_case = EvalCase(
      eval_id="case1",
      conversation=[
          _invocation_with_text("expected invocation 1."),
          _invocation_with_text("expected invocation 2."),
      ],
      rubrics=[case_rubric],
  )
  actual_invocations = [
      _invocation_with_text("actual invocation 1."),
      _invocation_with_text("actual invocation 2."),
  ]

  _copy_eval_case_rubrics_to_actual_invocations(eval_case, actual_invocations)

  first_actual, second_actual = actual_invocations
  assert first_actual.rubrics == [case_rubric]
  assert second_actual.rubrics == [case_rubric]
|
||||
|
||||
|
||||
def test_copy_invocation_rubrics_to_actual_invocations():
  """Each expected invocation's rubrics land on the actual one at that index."""

  def _invocation_with_text(text, **extra_fields):
    # Builds an invocation whose user content is a single text part.
    return Invocation(
        user_content=genai_types.Content(parts=[genai_types.Part(text=text)]),
        **extra_fields,
    )

  first_rubric = Rubric(
      rubric_id="r1", rubric_content=RubricContent(text_property="p1")
  )
  second_rubric = Rubric(
      rubric_id="r2", rubric_content=RubricContent(text_property="p2")
  )
  expected_invocations = [
      _invocation_with_text("expected invocation 1.", rubrics=[first_rubric]),
      _invocation_with_text("expected invocation 2.", rubrics=[second_rubric]),
  ]
  actual_invocations = [
      _invocation_with_text("actual invocation 1."),
      _invocation_with_text("actual invocation 2."),
  ]

  _copy_invocation_rubrics_to_actual_invocations(
      expected_invocations, actual_invocations
  )

  first_actual, second_actual = actual_invocations
  assert first_actual.rubrics == [first_rubric]
  assert second_actual.rubrics == [second_rubric]
|
||||
|
||||
@@ -465,6 +465,7 @@ class TestRubricBasedEvaluator:
|
||||
evaluator: RubricBasedEvaluator,
|
||||
):
|
||||
"""Tests convert_auto_rater_response_to_score with an empty response."""
|
||||
evaluator.create_effective_rubrics_list(None)
|
||||
response = LlmResponse(
|
||||
content=genai_types.Content(parts=[genai_types.Part(text="")])
|
||||
)
|
||||
@@ -477,6 +478,7 @@ class TestRubricBasedEvaluator:
|
||||
evaluator: RubricBasedEvaluator,
|
||||
):
|
||||
"""Tests convert_auto_rater_response_to_score with a malformed response."""
|
||||
evaluator.create_effective_rubrics_list(None)
|
||||
response = LlmResponse(
|
||||
content=genai_types.Content(
|
||||
parts=[genai_types.Part(text="This is not a valid format.")]
|
||||
@@ -491,6 +493,7 @@ class TestRubricBasedEvaluator:
|
||||
evaluator: RubricBasedEvaluator,
|
||||
):
|
||||
"""Tests convert_auto_rater_response_to_score with mixed verdicts."""
|
||||
evaluator.create_effective_rubrics_list(None)
|
||||
response_text = """
|
||||
Property: Is the response good?
|
||||
Rationale: It was good.
|
||||
@@ -515,6 +518,7 @@ class TestRubricBasedEvaluator:
|
||||
evaluator: RubricBasedEvaluator,
|
||||
):
|
||||
"""Tests convert_auto_rater_response_to_score with an invalid verdict."""
|
||||
evaluator.create_effective_rubrics_list(None)
|
||||
response_text = """
|
||||
Property: Is the response good?
|
||||
Rationale: It was good.
|
||||
@@ -539,6 +543,7 @@ class TestRubricBasedEvaluator:
|
||||
evaluator: RubricBasedEvaluator,
|
||||
):
|
||||
"""Tests convert_auto_rater_response_to_score with an unknown property."""
|
||||
evaluator.create_effective_rubrics_list(None)
|
||||
response_text = """
|
||||
Property: Is the response amazing?
|
||||
Rationale: It was amazing.
|
||||
@@ -551,4 +556,71 @@ class TestRubricBasedEvaluator:
|
||||
)
|
||||
auto_rater_score = evaluator.convert_auto_rater_response_to_score(response)
|
||||
assert auto_rater_score.score is None
|
||||
assert len(auto_rater_score.rubric_scores) == 0
|
||||
assert not auto_rater_score.rubric_scores
|
||||
|
||||
def test_create_effective_rubrics_list_with_invocation_rubrics(
    self, evaluator: RubricBasedEvaluator
):
  """An invocation rubric with a new id is added to the effective list.

  The expected ids "1" and "2" presumably come from the `evaluator` fixture's
  criterion rubrics, which are defined outside this test.
  """
  extra_rubric = Rubric(
      rubric_id="3",
      rubric_content=RubricContent(text_property="Invocation rubric"),
  )

  evaluator.create_effective_rubrics_list([extra_rubric])
  merged = evaluator.get_effective_rubrics_list()

  assert len(merged) == 3
  assert {r.rubric_id for r in merged} == {"1", "2", "3"}
|
||||
|
||||
def test_create_effective_rubrics_list_with_duplicate_invocation_rubric_id(
    self, evaluator: RubricBasedEvaluator
):
  """An invocation rubric reusing id "1" makes list creation raise ValueError.

  Id "1" is presumably already used by the `evaluator` fixture's criterion
  rubrics, which are defined outside this test.
  """
  clashing_rubric = Rubric(
      rubric_id="1",
      rubric_content=RubricContent(text_property="Invocation rubric"),
  )

  with pytest.raises(ValueError):
    evaluator.create_effective_rubrics_list([clashing_rubric])
|
||||
|
||||
def test_create_effective_rubrics_list_with_no_invocation_rubrics(
    self, evaluator: RubricBasedEvaluator
):
  """Passing None as invocation rubrics yields two rubrics, ids "1" and "2"."""
  evaluator.create_effective_rubrics_list(None)

  criterion_only = evaluator.get_effective_rubrics_list()

  assert len(criterion_only) == 2
  assert {r.rubric_id for r in criterion_only} == {"1", "2"}
|
||||
|
||||
def test_get_effective_rubrics_list_before_creation_raises_error(
    self, evaluator: RubricBasedEvaluator
):
  """Reading the effective list before it was created raises ValueError."""
  expected_message = "Effective rubrics list not initialized."

  with pytest.raises(ValueError, match=expected_message):
    evaluator.get_effective_rubrics_list()
|
||||
|
||||
def test_create_effective_rubrics_list_multiple_calls(
    self, evaluator: RubricBasedEvaluator
):
  """Each call rebuilds the effective list instead of growing the last one."""
  # (invocation rubric id, rubric text, ids expected after that call)
  rounds = (
      ("3", "Invocation rubric 1", {"1", "2", "3"}),
      ("4", "Invocation rubric 2", {"1", "2", "4"}),
  )

  for rubric_id, rubric_text, expected_ids in rounds:
    invocation_rubrics = [
        Rubric(
            rubric_id=rubric_id,
            rubric_content=RubricContent(text_property=rubric_text),
        )
    ]
    evaluator.create_effective_rubrics_list(invocation_rubrics)
    effective_rubrics = evaluator.get_effective_rubrics_list()

    assert len(effective_rubrics) == 3
    assert {r.rubric_id for r in effective_rubrics} == expected_ids
|
||||
|
||||
Reference in New Issue
Block a user