From 9e3658b39c06380b031c6177964c9607009c683d Mon Sep 17 00:00:00 2001 From: Abhinav Maddineni Date: Thu, 12 Mar 2026 10:23:44 -0400 Subject: [PATCH 1/2] feat: Add ignore_args option to tool trajectory evaluation Allow bypassing argument matching in trajectory evaluation by setting ignore_args=True on ToolTrajectoryCriterion. Works with all match types (EXACT, IN_ORDER, ANY_ORDER). Closes #4794 --- src/google/adk/evaluation/eval_metrics.py | 7 + .../adk/evaluation/trajectory_evaluator.py | 16 +- .../evaluation/test_trajectory_evaluator.py | 327 ++++++++++++++++++ 3 files changed, 345 insertions(+), 5 deletions(-) diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py index eb7c7e36cb..8a2821b905 100644 --- a/src/google/adk/evaluation/eval_metrics.py +++ b/src/google/adk/evaluation/eval_metrics.py @@ -226,6 +226,13 @@ class MatchType(Enum): ), ) + ignore_args: bool = Field( + default=False, + description=( + "If True, only tool names are compared; arguments are ignored." + ), + ) + @field_validator("match_type", mode="before") @classmethod def _coerce_match_type(cls, value: object) -> object: diff --git a/src/google/adk/evaluation/trajectory_evaluator.py b/src/google/adk/evaluation/trajectory_evaluator.py index 07626d7687..17bed7d522 100644 --- a/src/google/adk/evaluation/trajectory_evaluator.py +++ b/src/google/adk/evaluation/trajectory_evaluator.py @@ -82,6 +82,7 @@ def __init__( ) self._threshold = criterion.threshold self._match_type = criterion.match_type + self._ignore_args = criterion.ignore_args except ValidationError as e: expected_criterion_type_error = ValueError( f"`{eval_metric.metric_name}` metric expects a criterion of type" @@ -91,9 +92,11 @@ def __init__( elif eval_metric: self._threshold = eval_metric.threshold self._match_type = ToolTrajectoryCriterion.MatchType.EXACT + self._ignore_args = False else: self._threshold = threshold self._match_type = ToolTrajectoryCriterion.MatchType.EXACT + self._ignore_args = False @override def evaluate_invocations( @@ -191,9 +194,8 @@ def _are_tool_calls_in_order_match( try: current_expected = next(expected_it) for actual in actual_tool_calls: - if ( - actual.name == current_expected.name - and actual.args == current_expected.args + if actual.name == current_expected.name and ( + self._ignore_args or actual.args == current_expected.args ): current_expected = next(expected_it) except StopIteration: @@ -229,7 +231,9 @@ def _are_tool_calls_any_order_match( for expected in expected_tool_calls: found = False for i, actual in enumerate(actual_tool_calls_copy): - if actual.name == expected.name and actual.args == expected.args: + if actual.name == expected.name and ( + self._ignore_args or actual.args == expected.args + ): actual_tool_calls_copy.pop(i) found = True break @@ -260,7 +264,9 @@ def _are_tool_calls_exact_match( return False for actual, expected in zip(actual_tool_calls, expected_tool_calls): - if actual.name != expected.name or actual.args != expected.args: + if actual.name != expected.name or ( + not self._ignore_args and actual.args != expected.args + ): return False return True diff --git a/tests/unittests/evaluation/test_trajectory_evaluator.py b/tests/unittests/evaluation/test_trajectory_evaluator.py index 0fa3fa5a73..0f23170edb 100644 --- a/tests/unittests/evaluation/test_trajectory_evaluator.py +++ b/tests/unittests/evaluation/test_trajectory_evaluator.py @@ -462,3 +462,330 @@ def test_evaluate_invocations_no_invocations(evaluator: TrajectoryEvaluator): assert result.overall_score is None assert result.overall_eval_status == EvalStatus.NOT_EVALUATED assert not result.per_invocation_results + + +# --- ignore_args tests --- + + +def _make_ignore_args_evaluator( + match_type: ToolTrajectoryCriterion.MatchType, +) -> TrajectoryEvaluator: + return TrajectoryEvaluator( + eval_metric=EvalMetric( + metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value, + criterion=ToolTrajectoryCriterion( + threshold=0.5, + match_type=match_type, + ignore_args=True, + ), + ) + ) + + +def test_exact_ignore_args_passes_with_different_args(): + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t1", args={"a": 1}), + genai_types.FunctionCall(name="t2", args={"b": 2}), + ] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t1", args={"x": 99}), + genai_types.FunctionCall(name="t2", args={"y": 100}), + ] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 1.0 + + +def test_exact_ignore_args_fails_with_different_names(): + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={})] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t2", args={})] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 0.0 + + +def test_in_order_ignore_args_passes_with_different_args(): + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.IN_ORDER) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t1", args={"a": 1}), + genai_types.FunctionCall(name="extra", args={}), + genai_types.FunctionCall(name="t2", args={"b": 2}), + ] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t1", args={"x": 99}), + genai_types.FunctionCall(name="t2", args={"y": 100}), + ] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 1.0 + + +def test_any_order_ignore_args_passes_with_different_args(): + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.ANY_ORDER) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t2", args={"b": 2}), + genai_types.FunctionCall(name="t1", args={"a": 1}), + ] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t1", args={"x": 99}), + genai_types.FunctionCall(name="t2", args={"y": 100}), + ] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 1.0 + + +def test_any_order_ignore_args_fails_with_missing_tool(): + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.ANY_ORDER) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={})] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t1", args={}), + genai_types.FunctionCall(name="t2", args={}), + ] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 0.0 + + +def test_ignore_args_from_dict_config(): + """Tests that ignore_args works when passed as a dict criterion.""" + eval_metric = EvalMetric( + metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value, + criterion={ + "threshold": 0.5, + "match_type": "ANY_ORDER", + "ignore_args": True, + }, + ) + ev = TrajectoryEvaluator(eval_metric=eval_metric) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={"a": 1})] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={"z": 999})] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 1.0 + + +def test_exact_ignore_args_fails_with_different_count(): + """EXACT + ignore_args still fails when tool call counts differ.""" + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t1", args={}), + genai_types.FunctionCall(name="t2", args={}), + ] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={})] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 0.0 + + +def test_in_order_ignore_args_fails_with_wrong_order(): + """IN_ORDER + ignore_args still fails when order is wrong.""" + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.IN_ORDER) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t2", args={}), + genai_types.FunctionCall(name="t1", args={}), + ] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t1", args={}), + genai_types.FunctionCall(name="t2", args={}), + ] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 0.0 + + +def test_in_order_ignore_args_fails_with_missing_tool(): + """IN_ORDER + ignore_args still fails when expected tool is missing.""" + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.IN_ORDER) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={})] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[ + genai_types.FunctionCall(name="t1", args={}), + genai_types.FunctionCall(name="t2", args={}), + ] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 0.0 + + +def test_ignore_args_false_still_checks_args(): + """Confirm ignore_args=False (default) still enforces arg matching.""" + ev = TrajectoryEvaluator( + eval_metric=EvalMetric( + metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value, + criterion=ToolTrajectoryCriterion( + threshold=0.5, + match_type=ToolTrajectoryCriterion.MatchType.EXACT, + ignore_args=False, + ), + ) + ) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={"a": 1})] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={"a": 2})] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 0.0 + + +def test_ignore_args_empty_tool_lists(): + """ignore_args with empty tool lists on both sides should pass.""" + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT) + inv = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData(tool_uses=[]), + ) + result = ev.evaluate_invocations([inv], [inv]) + assert result.overall_score == 1.0 + + +def test_ignore_args_multiple_invocations_mixed(): + """ignore_args with multiple invocations: one matches, one doesn't.""" + ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT) + inv1_actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={"a": 1})] + ), + ) + inv1_expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={"z": 99})] + ), + ) + inv2_actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={})] + ), + ) + inv2_expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t2", args={})] + ), + ) + result = ev.evaluate_invocations( + [inv1_actual, inv2_actual], [inv1_expected, inv2_expected] + ) + assert result.overall_score == 0.5 + assert result.per_invocation_results[0].score == 1.0 + assert result.per_invocation_results[1].score == 0.0 + + +def test_ignore_args_with_camel_case_dict_config(): + """Tests ignore_args works via camelCase key (ignoreArgs) in dict.""" + eval_metric = EvalMetric( + metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value, + criterion={ + "threshold": 0.5, + "matchType": "EXACT", + "ignoreArgs": True, + }, + ) + ev = TrajectoryEvaluator(eval_metric=eval_metric) + actual = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={"a": 1})] + ), + ) + expected = Invocation( + user_content=_USER_CONTENT, + intermediate_data=IntermediateData( + tool_uses=[genai_types.FunctionCall(name="t1", args={"z": 999})] + ), + ) + result = ev.evaluate_invocations([actual], [expected]) + assert result.overall_score == 1.0 From 02540e482a0413b2f91ce2113ae828fedfa5acf5 Mon Sep 17 00:00:00 2001 From: Abhinav Maddineni Date: Thu, 12 Mar 2026 21:36:10 -0400 Subject: [PATCH 2/2] refactor: Address review feedback - consolidate init and parametrize tests --- .../adk/evaluation/trajectory_evaluator.py | 6 +- .../evaluation/test_trajectory_evaluator.py | 329 +++++++----------- 2 files changed, 122 insertions(+), 213 deletions(-) diff --git a/src/google/adk/evaluation/trajectory_evaluator.py b/src/google/adk/evaluation/trajectory_evaluator.py index 17bed7d522..25720c4a52 100644 --- a/src/google/adk/evaluation/trajectory_evaluator.py +++ b/src/google/adk/evaluation/trajectory_evaluator.py @@ -89,12 +89,8 @@ def __init__( f" `{TrajectoryEvaluator.criterion_type}`." ) raise expected_criterion_type_error from e - elif eval_metric: - self._threshold = eval_metric.threshold - self._match_type = ToolTrajectoryCriterion.MatchType.EXACT - self._ignore_args = False else: - self._threshold = threshold + self._threshold = eval_metric.threshold if eval_metric else threshold self._match_type = ToolTrajectoryCriterion.MatchType.EXACT self._ignore_args = False diff --git a/tests/unittests/evaluation/test_trajectory_evaluator.py b/tests/unittests/evaluation/test_trajectory_evaluator.py index 0f23170edb..59e1b96b8c 100644 --- a/tests/unittests/evaluation/test_trajectory_evaluator.py +++ b/tests/unittests/evaluation/test_trajectory_evaluator.py @@ -482,212 +482,136 @@ def _make_ignore_args_evaluator( ) -def test_exact_ignore_args_passes_with_different_args(): - ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT) - actual = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t1", args={"a": 1}), - genai_types.FunctionCall(name="t2", args={"b": 2}), - ] - ), - ) - expected = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t1", args={"x": 99}), - genai_types.FunctionCall(name="t2", args={"y": 100}), - ] - ), - ) - result = ev.evaluate_invocations([actual], [expected]) - assert result.overall_score == 1.0 - - -def test_exact_ignore_args_fails_with_different_names(): - ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT) - actual = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[genai_types.FunctionCall(name="t1", args={})] - ), - ) - expected = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[genai_types.FunctionCall(name="t2", args={})] - ), - ) - result = ev.evaluate_invocations([actual], [expected]) - assert result.overall_score == 0.0 - - -def test_in_order_ignore_args_passes_with_different_args(): - ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.IN_ORDER) - actual = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t1", args={"a": 1}), - genai_types.FunctionCall(name="extra", args={}), - genai_types.FunctionCall(name="t2", args={"b": 2}), - ] - ), - ) - expected = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t1", args={"x": 99}), - genai_types.FunctionCall(name="t2", args={"y": 100}), - ] - ), - ) - result = ev.evaluate_invocations([actual], [expected]) - assert result.overall_score == 1.0 - - -def test_any_order_ignore_args_passes_with_different_args(): - ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.ANY_ORDER) - actual = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t2", args={"b": 2}), - genai_types.FunctionCall(name="t1", args={"a": 1}), - ] - ), - ) - expected = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t1", args={"x": 99}), - genai_types.FunctionCall(name="t2", args={"y": 100}), - ] - ), - ) - result = ev.evaluate_invocations([actual], [expected]) - assert result.overall_score == 1.0 - - -def test_any_order_ignore_args_fails_with_missing_tool(): - ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.ANY_ORDER) - actual = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[genai_types.FunctionCall(name="t1", args={})] - ), - ) - expected = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t1", args={}), - genai_types.FunctionCall(name="t2", args={}), - ] - ), - ) - result = ev.evaluate_invocations([actual], [expected]) - assert result.overall_score == 0.0 - - -def test_ignore_args_from_dict_config(): - """Tests that ignore_args works when passed as a dict criterion.""" - eval_metric = EvalMetric( - metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value, - criterion={ - "threshold": 0.5, - "match_type": "ANY_ORDER", - "ignore_args": True, - }, - ) - ev = TrajectoryEvaluator(eval_metric=eval_metric) - actual = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[genai_types.FunctionCall(name="t1", args={"a": 1})] - ), - ) - expected = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[genai_types.FunctionCall(name="t1", args={"z": 999})] - ), - ) - result = ev.evaluate_invocations([actual], [expected]) - assert result.overall_score == 1.0 - - -def test_exact_ignore_args_fails_with_different_count(): - """EXACT + ignore_args still fails when tool call counts differ.""" - ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT) - actual = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t1", args={}), - genai_types.FunctionCall(name="t2", args={}), - ] - ), - ) - expected = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[genai_types.FunctionCall(name="t1", args={})] - ), - ) - result = ev.evaluate_invocations([actual], [expected]) - assert result.overall_score == 0.0 - - -def test_in_order_ignore_args_fails_with_wrong_order(): - """IN_ORDER + ignore_args still fails when order is wrong.""" - ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.IN_ORDER) - actual = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t2", args={}), - genai_types.FunctionCall(name="t1", args={}), - ] - ), - ) - expected = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t1", args={}), - genai_types.FunctionCall(name="t2", args={}), - ] - ), - ) - result = ev.evaluate_invocations([actual], [expected]) - assert result.overall_score == 0.0 +_EXACT = ToolTrajectoryCriterion.MatchType.EXACT +_IN_ORDER = ToolTrajectoryCriterion.MatchType.IN_ORDER +_ANY_ORDER = ToolTrajectoryCriterion.MatchType.ANY_ORDER -def test_in_order_ignore_args_fails_with_missing_tool(): - """IN_ORDER + ignore_args still fails when expected tool is missing.""" - ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.IN_ORDER) +@pytest.mark.parametrize( + ("match_type", "actual_tools", "expected_tools", "expected_score"), + [ + # EXACT: different args, same names -> pass + ( + _EXACT, + [ + genai_types.FunctionCall(name="t1", args={"a": 1}), + genai_types.FunctionCall(name="t2", args={"b": 2}), + ], + [ + genai_types.FunctionCall(name="t1", args={"x": 99}), + genai_types.FunctionCall(name="t2", args={"y": 100}), + ], + 1.0, + ), + # EXACT: different names -> fail + ( + _EXACT, + [genai_types.FunctionCall(name="t1", args={})], + [genai_types.FunctionCall(name="t2", args={})], + 0.0, + ), + # EXACT: different tool count -> fail + ( + _EXACT, + [ + genai_types.FunctionCall(name="t1", args={}), + genai_types.FunctionCall(name="t2", args={}), + ], + [genai_types.FunctionCall(name="t1", args={})], + 0.0, + ), + # EXACT: empty lists -> pass + ( + _EXACT, + [], + [], + 1.0, + ), + # IN_ORDER: different args with extra tools -> pass + ( + _IN_ORDER, + [ + genai_types.FunctionCall(name="t1", args={"a": 1}), + genai_types.FunctionCall(name="extra", args={}), + genai_types.FunctionCall(name="t2", args={"b": 2}), + ], + [ + genai_types.FunctionCall(name="t1", args={"x": 99}), + genai_types.FunctionCall(name="t2", args={"y": 100}), + ], + 1.0, + ), + # IN_ORDER: wrong order -> fail + ( + _IN_ORDER, + [ + genai_types.FunctionCall(name="t2", args={}), + genai_types.FunctionCall(name="t1", args={}), + ], + [ + genai_types.FunctionCall(name="t1", args={}), + genai_types.FunctionCall(name="t2", args={}), + ], + 0.0, + ), + # IN_ORDER: missing tool -> fail + ( + _IN_ORDER, + [genai_types.FunctionCall(name="t1", args={})], + [ + genai_types.FunctionCall(name="t1", args={}), + genai_types.FunctionCall(name="t2", args={}), + ], + 0.0, + ), + # ANY_ORDER: different args, swapped order -> pass + ( + _ANY_ORDER, + [ + genai_types.FunctionCall(name="t2", args={"b": 2}), + genai_types.FunctionCall(name="t1", args={"a": 1}), + ], + [ + genai_types.FunctionCall(name="t1", args={"x": 99}), + genai_types.FunctionCall(name="t2", args={"y": 100}), + ], + 1.0, + ), + # ANY_ORDER: missing tool -> fail + ( + _ANY_ORDER, + [genai_types.FunctionCall(name="t1", args={})], + [ + genai_types.FunctionCall(name="t1", args={}), + genai_types.FunctionCall(name="t2", args={}), + ], + 0.0, + ), + ], + ids=[ + "exact_different_args_pass", + "exact_different_names_fail", + "exact_different_count_fail", + "exact_empty_lists_pass", + "in_order_different_args_pass", + "in_order_wrong_order_fail", + "in_order_missing_tool_fail", + "any_order_different_args_pass", + "any_order_missing_tool_fail", + ], +) +def test_ignore_args(match_type, actual_tools, expected_tools, expected_score): + ev = _make_ignore_args_evaluator(match_type) actual = Invocation( user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[genai_types.FunctionCall(name="t1", args={})] - ), + intermediate_data=IntermediateData(tool_uses=actual_tools), ) expected = Invocation( user_content=_USER_CONTENT, - intermediate_data=IntermediateData( - tool_uses=[ - genai_types.FunctionCall(name="t1", args={}), - genai_types.FunctionCall(name="t2", args={}), - ] - ), + intermediate_data=IntermediateData(tool_uses=expected_tools), ) result = ev.evaluate_invocations([actual], [expected]) - assert result.overall_score == 0.0 + assert result.overall_score == expected_score def test_ignore_args_false_still_checks_args(): @@ -718,17 +642,6 @@ def test_ignore_args_false_still_checks_args(): assert result.overall_score == 0.0 -def test_ignore_args_empty_tool_lists(): - """ignore_args with empty tool lists on both sides should pass.""" - ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT) - inv = Invocation( - user_content=_USER_CONTENT, - intermediate_data=IntermediateData(tool_uses=[]), - ) - result = ev.evaluate_invocations([inv], [inv]) - assert result.overall_score == 1.0 - - def test_ignore_args_multiple_invocations_mixed(): """ignore_args with multiple invocations: one matches, one doesn't.""" ev = _make_ignore_args_evaluator(ToolTrajectoryCriterion.MatchType.EXACT)