diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py index cce160ae..7176199b 100644 --- a/src/google/adk/cli/cli_eval.py +++ b/src/google/adk/cli/cli_eval.py @@ -201,9 +201,11 @@ def pretty_print_eval_result(eval_result: EvalCaseResult): for r in metric_result.criterion.rubrics } for rubric_score in metric_result.details.rubric_scores: - rubric = rubrics_by_id.get(rubric_score.rubric_id) + rubric_text = rubrics_by_id.get(rubric_score.rubric_id) + if not rubric_text: + rubric_text = rubric_score.rubric_id click.echo( - f"Rubric: {rubric}, " + f"Rubric: {rubric_text}, " f"Score: {rubric_score.score}, " f"Reasoning: {rubric_score.rationale}" ) @@ -243,6 +245,8 @@ def pretty_print_eval_result(eval_result: EvalCaseResult): } for rubric_score in metric_result.details.rubric_scores: rubric = rubrics_by_id.get(rubric_score.rubric_id) + if not rubric: + rubric = rubric_score.rubric_id row_data[f"Rubric: {rubric}"] = ( f"Reasoning: {rubric_score.rationale}, " f"Score: {rubric_score.score}" diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index bc1d3886..7031266e 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -46,6 +46,7 @@ from .eval_metrics import EvalMetric from .eval_metrics import EvalMetricResult from .eval_metrics import EvalMetricResultDetails from .eval_metrics import EvalMetricResultPerInvocation +from .eval_metrics import Rubric from .eval_result import EvalCaseResult from .eval_set import EvalCase from .eval_set_results_manager import EvalSetResultsManager @@ -67,6 +68,46 @@ def _get_session_id() -> str: return f'{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}' +def _add_rubrics_to_invocation( + invocation: Invocation, rubrics_to_add: list[Rubric] +): + """Adds rubrics to invocation, throwing ValueError on duplicate rubric_id.""" + if not invocation.rubrics: + invocation.rubrics = [] + existing_ids = {r.rubric_id for r in invocation.rubrics} + for rubric in rubrics_to_add: + if rubric.rubric_id in existing_ids: + raise ValueError( + f"Rubric with rubric_id '{rubric.rubric_id}' already exists." + ) + invocation.rubrics.append(rubric) + existing_ids.add(rubric.rubric_id) + + +def _copy_eval_case_rubrics_to_actual_invocations( + eval_case: EvalCase, actual_invocations: list[Invocation] +): + """Copies EvalCase level rubrics to all actual invocations.""" + if hasattr(eval_case, 'rubrics') and eval_case.rubrics: + for invocation in actual_invocations: + _add_rubrics_to_invocation(invocation, eval_case.rubrics) + + +def _copy_invocation_rubrics_to_actual_invocations( + expected_invocations: Optional[list[Invocation]], + actual_invocations: list[Invocation], +): + """Copies invocation level rubrics to corresponding actual invocations.""" + if expected_invocations: + for actual_invocation, expected_invocation in zip( + actual_invocations, expected_invocations + ): + if expected_invocation.rubrics: + _add_rubrics_to_invocation( + actual_invocation, expected_invocation.rubrics + ) + + @experimental class LocalEvalService(BaseEvalService): """An implementation of BaseEvalService, that runs the evals locally.""" @@ -249,76 +290,27 @@ class LocalEvalService(BaseEvalService): ) ) + actual_invocations = inference_result.inferences + expected_invocations = eval_case.conversation + + # 1. Copy EvalCase level rubrics to all actual invocations. + _copy_eval_case_rubrics_to_actual_invocations(eval_case, actual_invocations) + + # 2. If expected invocations are present, copy invocation level + # rubrics to corresponding actual invocations. + _copy_invocation_rubrics_to_actual_invocations( + expected_invocations, actual_invocations + ) + for eval_metric in evaluate_config.eval_metrics: # Perform evaluation of the metric. - try: - with client_label_context(EVAL_CLIENT_LABEL): - evaluation_result = await self._evaluate_metric( - eval_metric=eval_metric, - actual_invocations=inference_result.inferences, - expected_invocations=eval_case.conversation, - conversation_scenario=eval_case.conversation_scenario, - ) - except Exception as e: - # We intentionally catch the Exception as we don't want failures to - # affect other metric evaluation. - logger.error( - "Metric evaluation failed for metric `%s` for eval case id '%s'" - ' with following error `%s`', - eval_metric.metric_name, - eval_case.eval_id, - e, - exc_info=True, - ) - # We use an empty result. - evaluation_result = EvaluationResult( - overall_eval_status=EvalStatus.NOT_EVALUATED - ) - - # Track overall score across all invocations. - eval_metric_result_details = EvalMetricResultDetails( - rubric_scores=evaluation_result.overall_rubric_scores + await self._evaluate_metric_for_eval_case( + eval_metric, + eval_case, + inference_result, + eval_metric_result_per_invocation, + overall_eval_metric_results, ) - overall_eval_metric_results.append( - EvalMetricResult( - score=evaluation_result.overall_score, - eval_status=evaluation_result.overall_eval_status, - details=eval_metric_result_details, - **eval_metric.model_dump(), - ) - ) - - if ( - evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED - and len(evaluation_result.per_invocation_results) - != len(eval_metric_result_per_invocation) - ): - raise ValueError( - 'Eval metric should return results for each invocation. Found ' - f'{len(evaluation_result.per_invocation_results)} results for ' - f'{len(eval_metric_result_per_invocation)} invocations.' - ) - - # Track score across individual invocations. - for idx, invocation in enumerate(eval_metric_result_per_invocation): - invocation_result = ( - evaluation_result.per_invocation_results[idx] - if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED - else PerInvocationResult( - actual_invocation=invocation.actual_invocation - ) - ) - eval_metric_result_details = EvalMetricResultDetails( - rubric_scores=invocation_result.rubric_scores - ) - invocation.eval_metric_results.append( - EvalMetricResult( - score=invocation_result.score, - eval_status=invocation_result.eval_status, - details=eval_metric_result_details, - **eval_metric.model_dump(), - ) - ) final_eval_status = self._generate_final_eval_status( overall_eval_metric_results @@ -342,6 +334,84 @@ class LocalEvalService(BaseEvalService): return (inference_result, eval_case_result) + async def _evaluate_metric_for_eval_case( + self, + eval_metric: EvalMetric, + eval_case: EvalCase, + inference_result: InferenceResult, + eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation], + overall_eval_metric_results: list[EvalMetricResult], + ): + """Performs evaluation of a metric for a given eval case and inference result.""" + try: + with client_label_context(EVAL_CLIENT_LABEL): + evaluation_result = await self._evaluate_metric( + eval_metric=eval_metric, + actual_invocations=inference_result.inferences, + expected_invocations=eval_case.conversation, + conversation_scenario=eval_case.conversation_scenario, + ) + except Exception as e: + # We intentionally catch the Exception as we don't want failures to + # affect other metric evaluation. + logger.error( + "Metric evaluation failed for metric `%s` for eval case id '%s'" + ' with following error `%s`', + eval_metric.metric_name, + eval_case.eval_id, + e, + exc_info=True, + ) + # We use an empty result. + evaluation_result = EvaluationResult( + overall_eval_status=EvalStatus.NOT_EVALUATED + ) + + # Track overall score across all invocations. + eval_metric_result_details = EvalMetricResultDetails( + rubric_scores=evaluation_result.overall_rubric_scores + ) + overall_eval_metric_results.append( + EvalMetricResult( + score=evaluation_result.overall_score, + eval_status=evaluation_result.overall_eval_status, + details=eval_metric_result_details, + **eval_metric.model_dump(), + ) + ) + + if ( + evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED + and len(evaluation_result.per_invocation_results) + != len(eval_metric_result_per_invocation) + ): + raise ValueError( + 'Eval metric should return results for each invocation. Found ' + f'{len(evaluation_result.per_invocation_results)} results for ' + f'{len(eval_metric_result_per_invocation)} invocations.' + ) + + # Track score across individual invocations. + for idx, invocation in enumerate(eval_metric_result_per_invocation): + invocation_result = ( + evaluation_result.per_invocation_results[idx] + if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED + else PerInvocationResult( + actual_invocation=invocation.actual_invocation + ) + ) + eval_metric_result_details = EvalMetricResultDetails( + rubric_scores=invocation_result.rubric_scores + ) + invocation.eval_metric_results.append( + EvalMetricResult( + score=invocation_result.score, + eval_status=invocation_result.eval_status, + details=eval_metric_result_details, + **eval_metric.model_dump(), + ) + ) + async def _evaluate_metric( self, eval_metric: EvalMetric, diff --git a/src/google/adk/evaluation/rubric_based_evaluator.py b/src/google/adk/evaluation/rubric_based_evaluator.py index 1d361cb1..f21453c0 100644 --- a/src/google/adk/evaluation/rubric_based_evaluator.py +++ b/src/google/adk/evaluation/rubric_based_evaluator.py @@ -328,28 +328,67 @@ class RubricBasedEvaluator(LlmAsJudge): assert self._criterion.rubrics, "Rubrics are required." self._rubrics: list[Rubric] = self._criterion.rubrics + self._effective_rubrics_list: Optional[list[Rubric]] = None self._normalized_rubric_to_id_map = { _normalize_text(r.rubric_content.text_property): r.rubric_id for r in self._rubrics } + def create_effective_rubrics_list( + self, + invocation_rubrics: Optional[list[Rubric]], + ) -> None: + rubrics_by_id = {} + + def _add_rubrics(rubrics_to_add: list[Rubric], scope_name: str): + for r in rubrics_to_add: + if r.rubric_id in rubrics_by_id: + raise ValueError( + f"Rubric with rubric_id '{r.rubric_id}' already exists. Rubric" + f" defined in {scope_name} conflicts with an existing rubric." + ) + rubrics_by_id[r.rubric_id] = r + + _add_rubrics(self._rubrics, "criterion") + + if invocation_rubrics: + _add_rubrics(invocation_rubrics, "invocation") + + self._effective_rubrics_list = list(rubrics_by_id.values()) + + def get_effective_rubrics_list(self) -> list[Rubric]: + """Returns the effective rubrics list.""" + if self._effective_rubrics_list is None: + raise ValueError( + "Effective rubrics list not initialized. Call" + " create_effective_rubrics_list() first." + ) + return self._effective_rubrics_list + @override def convert_auto_rater_response_to_score( - self, auto_rater_response: LlmResponse + self, + auto_rater_response: LlmResponse, ) -> AutoRaterScore: """Returns an AutoRaterScore generated from AutoRater's response.""" response_text = get_text_from_content(auto_rater_response.content) rubric_responses = self._auto_rater_response_parser.parse(response_text) rubric_scores = [] + normalized_rubric_to_rubric_map = {} + for r in self.get_effective_rubrics_list(): + normalized_rubric_to_rubric_map[ + _normalize_text(r.rubric_content.text_property) + ] = r + for rubric_response in rubric_responses: - normalized_rubric = _normalize_text(rubric_response.property_text) - rubric_id = self._normalized_rubric_to_id_map.get(normalized_rubric, None) - if rubric_id: + normalized_rubric_text = _normalize_text(rubric_response.property_text) + rubric = normalized_rubric_to_rubric_map.get(normalized_rubric_text, None) + if rubric: rubric_scores.append( RubricScore( - rubric_id=rubric_id, + rubric_id=rubric.rubric_id, rationale=rubric_response.rationale, score=rubric_response.score, ) diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py index 90f02d3b..a2816f3a 100644 --- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py +++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py @@ -25,6 +25,7 @@ from .eval_case import Invocation from .eval_case import InvocationEvents from .eval_metrics import EvalMetric from .eval_metrics import RubricsBasedCriterion +from .eval_rubrics import Rubric from .llm_as_judge_utils import get_text_from_content from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str from .llm_as_judge_utils import get_tool_declarations_as_json_str @@ -264,15 +265,19 @@ class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator): @override def format_auto_rater_prompt( - self, actual_invocation: Invocation, _: Optional[Invocation] + self, + actual_invocation: Invocation, + _: Optional[Invocation], ) -> str: """Returns the autorater prompt.""" - + self.create_effective_rubrics_list(actual_invocation.rubrics) user_input = get_text_from_content(actual_invocation.user_content) final_response = get_text_from_content(actual_invocation.final_response) - rubrics = "\n* ".join( - [r.rubric_content.text_property for r in self._rubrics] - ) + + rubrics_text = "\n".join([ + f"* {r.rubric_content.text_property}" + for r in self._effective_rubrics_list + ]) developer_instructions = "" tool_declarations = "Agent has no tools." @@ -299,7 +304,7 @@ class RubricBasedFinalResponseQualityV1Evaluator(RubricBasedEvaluator): user_input=user_input, response_steps=response_steps, final_response=final_response, - rubrics=rubrics, + rubrics=rubrics_text, ) return auto_rater_prompt diff --git a/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py b/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py index bb64124e..ad7e7194 100644 --- a/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py +++ b/src/google/adk/evaluation/rubric_based_tool_use_quality_v1.py @@ -164,17 +164,21 @@ class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator): @override def format_auto_rater_prompt( - self, actual_invocation: Invocation, _: Optional[Invocation] + self, + actual_invocation: Invocation, + _: Optional[Invocation], ) -> str: """Returns the autorater prompt.""" - + self.create_effective_rubrics_list(actual_invocation.rubrics) user_input = get_text_from_content(actual_invocation.user_content) tool_usage = get_tool_calls_and_responses_as_json_str( actual_invocation.intermediate_data ) - rubrics = "\n* ".join( - [r.rubric_content.text_property for r in self._rubrics] - ) + + rubrics_text = "\n".join([ + f"* {r.rubric_content.text_property}" + for r in self._effective_rubrics_list + ]) app_details = actual_invocation.app_details tool_declarations = "Agent has no tools." @@ -185,5 +189,5 @@ class RubricBasedToolUseV1Evaluator(RubricBasedEvaluator): tool_declarations=tool_declarations, user_input=user_input, tool_usage=tool_usage, - rubrics=rubrics, + rubrics=rubrics_text, ) diff --git a/tests/unittests/evaluation/test_llm_as_judge.py b/tests/unittests/evaluation/test_llm_as_judge.py index eb5a1154..d2f75da5 100644 --- a/tests/unittests/evaluation/test_llm_as_judge.py +++ b/tests/unittests/evaluation/test_llm_as_judge.py @@ -20,6 +20,7 @@ from google.adk.evaluation.eval_case import Invocation from google.adk.evaluation.eval_metrics import EvalMetric from google.adk.evaluation.eval_metrics import JudgeModelOptions from google.adk.evaluation.eval_metrics import LlmAsAJudgeCriterion +from google.adk.evaluation.eval_rubrics import Rubric from google.adk.evaluation.evaluator import EvalStatus from google.adk.evaluation.evaluator import EvaluationResult from google.adk.evaluation.evaluator import PerInvocationResult @@ -35,12 +36,17 @@ import pytest class MockLlmAsJudge(LlmAsJudge): def format_auto_rater_prompt( - self, actual_invocation: Invocation, expected_invocation: Invocation + self, + actual_invocation: Invocation, + expected_invocation: Optional[Invocation], + rubrics: Optional[list[Rubric]] = None, ) -> str: return "formatted prompt" def convert_auto_rater_response_to_score( - self, llm_response: LlmResponse + self, + llm_response: LlmResponse, + rubrics: Optional[list[Rubric]] = None, ) -> AutoRaterScore: return AutoRaterScore(score=1.0) diff --git a/tests/unittests/evaluation/test_local_eval_service.py b/tests/unittests/evaluation/test_local_eval_service.py index da5a1736..08ef2aa8 100644 --- a/tests/unittests/evaluation/test_local_eval_service.py +++ b/tests/unittests/evaluation/test_local_eval_service.py @@ -34,6 +34,8 @@ from google.adk.evaluation.eval_metrics import Interval from google.adk.evaluation.eval_metrics import MetricInfo from google.adk.evaluation.eval_metrics import MetricValueInfo from google.adk.evaluation.eval_result import EvalCaseResult +from google.adk.evaluation.eval_rubrics import Rubric +from google.adk.evaluation.eval_rubrics import RubricContent from google.adk.evaluation.eval_set import EvalCase from google.adk.evaluation.eval_set import EvalSet from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager @@ -42,6 +44,9 @@ from google.adk.evaluation.evaluator import EvalStatus from google.adk.evaluation.evaluator import EvaluationResult from google.adk.evaluation.evaluator import Evaluator from google.adk.evaluation.evaluator import PerInvocationResult +from google.adk.evaluation.local_eval_service import _add_rubrics_to_invocation +from google.adk.evaluation.local_eval_service import _copy_eval_case_rubrics_to_actual_invocations +from google.adk.evaluation.local_eval_service import _copy_invocation_rubrics_to_actual_invocations from google.adk.evaluation.local_eval_service import LocalEvalService from google.adk.evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY from google.adk.models.registry import LLMRegistry @@ -678,3 +683,111 @@ async def test_mcp_stdio_agent_no_runtime_error(mocker): import shutil shutil.rmtree(test_dir, ignore_errors=True) + + +def test_add_rubrics_to_invocation_initializes_rubrics_list(): + invocation = Invocation(user_content=genai_types.Content()) + rubric = Rubric( + rubric_id="r1", rubric_content=RubricContent(text_property="p1") + ) + _add_rubrics_to_invocation(invocation, [rubric]) + assert invocation.rubrics == [rubric] + + +def test_add_rubrics_to_invocation_adds_to_existing_list(): + rubric1 = Rubric( + rubric_id="r1", rubric_content=RubricContent(text_property="p1") + ) + rubric2 = Rubric( + rubric_id="r2", rubric_content=RubricContent(text_property="p2") + ) + invocation = Invocation(user_content=genai_types.Content(), rubrics=[rubric1]) + _add_rubrics_to_invocation(invocation, [rubric2]) + assert invocation.rubrics == [rubric1, rubric2] + + +def test_add_rubrics_to_invocation_errors_on_duplicate_id(): + rubric1 = Rubric( + rubric_id="r1", rubric_content=RubricContent(text_property="p1") + ) + rubric2 = Rubric( + rubric_id="r1", rubric_content=RubricContent(text_property="p2") + ) + invocation = Invocation(user_content=genai_types.Content(), rubrics=[rubric1]) + with pytest.raises(ValueError): + _add_rubrics_to_invocation(invocation, [rubric2]) + + +def test_copy_eval_case_rubrics_to_actual_invocations(): + rubric1 = Rubric( + rubric_id="r1", rubric_content=RubricContent(text_property="p1") + ) + eval_case = EvalCase( + eval_id="case1", + conversation=[ + Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="expected invocation 1.")] + ) + ), + Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="expected invocation 2.")] + ) + ), + ], + rubrics=[rubric1], + ) + invocations = [ + Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="actual invocation 1.")] + ) + ), + Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="actual invocation 2.")] + ) + ), + ] + _copy_eval_case_rubrics_to_actual_invocations(eval_case, invocations) + assert invocations[0].rubrics == [rubric1] + assert invocations[1].rubrics == [rubric1] + + +def test_copy_invocation_rubrics_to_actual_invocations(): + rubric1 = Rubric( + rubric_id="r1", rubric_content=RubricContent(text_property="p1") + ) + rubric2 = Rubric( + rubric_id="r2", rubric_content=RubricContent(text_property="p2") + ) + expected = [ + Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="expected invocation 1.")] + ), + rubrics=[rubric1], + ), + Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="expected invocation 2.")] + ), + rubrics=[rubric2], + ), + ] + actual = [ + Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="actual invocation 1.")] + ) + ), + Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text="actual invocation 2.")] + ) + ), + ] + _copy_invocation_rubrics_to_actual_invocations(expected, actual) + assert actual[0].rubrics == [rubric1] + assert actual[1].rubrics == [rubric2] diff --git a/tests/unittests/evaluation/test_rubric_based_evaluator.py b/tests/unittests/evaluation/test_rubric_based_evaluator.py index b538b757..1f731f92 100644 --- a/tests/unittests/evaluation/test_rubric_based_evaluator.py +++ b/tests/unittests/evaluation/test_rubric_based_evaluator.py @@ -465,6 +465,7 @@ class TestRubricBasedEvaluator: evaluator: RubricBasedEvaluator, ): """Tests convert_auto_rater_response_to_score with an empty response.""" + evaluator.create_effective_rubrics_list(None) response = LlmResponse( content=genai_types.Content(parts=[genai_types.Part(text="")]) ) @@ -477,6 +478,7 @@ class TestRubricBasedEvaluator: evaluator: RubricBasedEvaluator, ): """Tests convert_auto_rater_response_to_score with a malformed response.""" + evaluator.create_effective_rubrics_list(None) response = LlmResponse( content=genai_types.Content( parts=[genai_types.Part(text="This is not a valid format.")] @@ -491,6 +493,7 @@ class TestRubricBasedEvaluator: evaluator: RubricBasedEvaluator, ): """Tests convert_auto_rater_response_to_score with mixed verdicts.""" + evaluator.create_effective_rubrics_list(None) response_text = """ Property: Is the response good? Rationale: It was good. @@ -515,6 +518,7 @@ class TestRubricBasedEvaluator: evaluator: RubricBasedEvaluator, ): """Tests convert_auto_rater_response_to_score with an invalid verdict.""" + evaluator.create_effective_rubrics_list(None) response_text = """ Property: Is the response good? Rationale: It was good. @@ -539,6 +543,7 @@ class TestRubricBasedEvaluator: evaluator: RubricBasedEvaluator, ): """Tests convert_auto_rater_response_to_score with an unknown property.""" + evaluator.create_effective_rubrics_list(None) response_text = """ Property: Is the response amazing? Rationale: It was amazing. @@ -551,4 +556,71 @@ class TestRubricBasedEvaluator: ) auto_rater_score = evaluator.convert_auto_rater_response_to_score(response) assert auto_rater_score.score is None - assert len(auto_rater_score.rubric_scores) == 0 + assert not auto_rater_score.rubric_scores + + def test_create_effective_rubrics_list_with_invocation_rubrics( + self, evaluator: RubricBasedEvaluator + ): + invocation_rubrics = [ + Rubric( + rubric_id="3", + rubric_content=RubricContent(text_property="Invocation rubric"), + ) + ] + evaluator.create_effective_rubrics_list(invocation_rubrics) + effective_rubrics = evaluator.get_effective_rubrics_list() + assert len(effective_rubrics) == 3 + assert {r.rubric_id for r in effective_rubrics} == {"1", "2", "3"} + + def test_create_effective_rubrics_list_with_duplicate_invocation_rubric_id( + self, evaluator: RubricBasedEvaluator + ): + invocation_rubrics = [ + Rubric( + rubric_id="1", + rubric_content=RubricContent(text_property="Invocation rubric"), + ) + ] + with pytest.raises(ValueError): + evaluator.create_effective_rubrics_list(invocation_rubrics) + + def test_create_effective_rubrics_list_with_no_invocation_rubrics( + self, evaluator: RubricBasedEvaluator + ): + evaluator.create_effective_rubrics_list(None) + effective_rubrics = evaluator.get_effective_rubrics_list() + assert len(effective_rubrics) == 2 + assert {r.rubric_id for r in effective_rubrics} == {"1", "2"} + + def test_get_effective_rubrics_list_before_creation_raises_error( + self, evaluator: RubricBasedEvaluator + ): + with pytest.raises( + ValueError, match="Effective rubrics list not initialized." + ): + evaluator.get_effective_rubrics_list() + + def test_create_effective_rubrics_list_multiple_calls( + self, evaluator: RubricBasedEvaluator + ): + invocation_rubrics1 = [ + Rubric( + rubric_id="3", + rubric_content=RubricContent(text_property="Invocation rubric 1"), + ) + ] + evaluator.create_effective_rubrics_list(invocation_rubrics1) + effective_rubrics1 = evaluator.get_effective_rubrics_list() + assert len(effective_rubrics1) == 3 + assert {r.rubric_id for r in effective_rubrics1} == {"1", "2", "3"} + + invocation_rubrics2 = [ + Rubric( + rubric_id="4", + rubric_content=RubricContent(text_property="Invocation rubric 2"), + ) + ] + evaluator.create_effective_rubrics_list(invocation_rubrics2) + effective_rubrics2 = evaluator.get_effective_rubrics_list() + assert len(effective_rubrics2) == 3 + assert {r.rubric_id for r in effective_rubrics2} == {"1", "2", "4"}