Skip to content

Commit 6668a79

Browse files
committed
fix(eval): make _EvalMetricResultWithInvocation.expected_invocation optional
conversation_scenario eval cases intentionally pass expected_invocation=None from local_eval_service (matching the public EvalMetricResultPerInvocation model), but the private _EvalMetricResultWithInvocation required a non-None Invocation, causing a pydantic ValidationError.

Changes:
- Make expected_invocation Optional[Invocation] with default None
- Guard attribute access in _print_details when expected_invocation is None

Fixes #5214
1 parent f973673 commit 6668a79

File tree

2 files changed

+174
-1
lines changed

2 files changed

+174
-1
lines changed

src/google/adk/evaluation/agent_evaluator.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ class _EvalMetricResultWithInvocation(BaseModel):
9090
"""
9191

9292
actual_invocation: Invocation
93-
expected_invocation: Invocation
93+
expected_invocation: Optional[Invocation] = None
9494
eval_metric_result: EvalMetricResult
9595

9696

@@ -438,15 +438,21 @@ def _print_details(
438438
"threshold": threshold,
439439
"prompt": AgentEvaluator._convert_content_to_text(
440440
per_invocation_result.expected_invocation.user_content
441+
if per_invocation_result.expected_invocation
442+
else None
441443
),
442444
"expected_response": AgentEvaluator._convert_content_to_text(
443445
per_invocation_result.expected_invocation.final_response
446+
if per_invocation_result.expected_invocation
447+
else None
444448
),
445449
"actual_response": AgentEvaluator._convert_content_to_text(
446450
per_invocation_result.actual_invocation.final_response
447451
),
448452
"expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
449453
per_invocation_result.expected_invocation.intermediate_data
454+
if per_invocation_result.expected_invocation
455+
else None
450456
),
451457
"actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
452458
per_invocation_result.actual_invocation.intermediate_data
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Regression tests for _EvalMetricResultWithInvocation None-handling.
16+
17+
Covers the bug described in https://github.com/google/adk-python/issues/5214
18+
where passing expected_invocation=None (the normal path for
19+
conversation_scenario eval cases) caused a pydantic ValidationError.
20+
"""
21+
22+
from __future__ import annotations
23+
24+
from unittest.mock import patch
25+
26+
import pytest
27+
28+
from google.genai import types as genai_types
29+
30+
from google.adk.evaluation.agent_evaluator import AgentEvaluator
31+
from google.adk.evaluation.agent_evaluator import (
32+
_EvalMetricResultWithInvocation,
33+
)
34+
from google.adk.evaluation.eval_case import Invocation
35+
from google.adk.evaluation.eval_metrics import EvalMetricResult
36+
from google.adk.evaluation.eval_metrics import EvalMetricResultPerInvocation
37+
from google.adk.evaluation.eval_metrics import EvalStatus
38+
from google.adk.evaluation.eval_result import EvalCaseResult
39+
40+
41+
# ---------------------------------------------------------------------------
42+
# Helpers
43+
# ---------------------------------------------------------------------------
44+
45+
def _make_invocation(**overrides) -> Invocation:
46+
"""Return a minimal Invocation instance."""
47+
defaults = {
48+
"user_content": genai_types.Content(
49+
role="user", parts=[genai_types.Part(text="hello")]
50+
),
51+
}
52+
defaults.update(overrides)
53+
return Invocation(**defaults)
54+
55+
56+
def _make_eval_metric_result(
    metric_name: str = "test_metric",
    score: float = 1.0,
    status: EvalStatus = EvalStatus.PASSED,
) -> EvalMetricResult:
  """Build an EvalMetricResult with sensible test defaults."""
  return EvalMetricResult(
      metric_name=metric_name, score=score, eval_status=status
  )
66+
67+
68+
# ---------------------------------------------------------------------------
69+
# Tests: _EvalMetricResultWithInvocation accepts None
70+
# ---------------------------------------------------------------------------
71+
72+
class TestEvalMetricResultWithInvocationNone:
  """Regression: expected_invocation=None must be accepted (issue #5214)."""

  def test_construction_with_none_expected_invocation(self):
    """_EvalMetricResultWithInvocation should accept None for expected_invocation."""
    model = _EvalMetricResultWithInvocation(
        actual_invocation=_make_invocation(),
        expected_invocation=None,
        eval_metric_result=_make_eval_metric_result(),
    )

    assert model.expected_invocation is None

  def test_construction_with_omitted_expected_invocation(self):
    """expected_invocation should default to None when omitted."""
    model = _EvalMetricResultWithInvocation(
        actual_invocation=_make_invocation(),
        eval_metric_result=_make_eval_metric_result(),
    )

    assert model.expected_invocation is None

  def test_construction_with_real_expected_invocation(self):
    """Normal case: providing a real Invocation should still work."""
    expected = _make_invocation()

    model = _EvalMetricResultWithInvocation(
        actual_invocation=_make_invocation(),
        expected_invocation=expected,
        eval_metric_result=_make_eval_metric_result(),
    )

    assert model.expected_invocation is expected
101+
102+
103+
# ---------------------------------------------------------------------------
104+
# Tests: _get_eval_metric_results_with_invocation passes None through
105+
# ---------------------------------------------------------------------------
106+
107+
class TestGetEvalMetricResultsWithNone:
  """_get_eval_metric_results_with_invocation must propagate None."""

  def test_none_expected_invocation_propagated(self):
    actual_invocation = _make_invocation()
    metric_result = _make_eval_metric_result(metric_name="m1")

    per_invocation = EvalMetricResultPerInvocation(
        actual_invocation=actual_invocation,
        expected_invocation=None,
        eval_metric_results=[metric_result],
    )
    case_result = EvalCaseResult(
        eval_set_id="test_set",
        eval_id="scenario_1",
        final_eval_status=EvalStatus.PASSED,
        overall_eval_metric_results=[metric_result],
        eval_metric_result_per_invocation=[per_invocation],
        session_id="sess-1",
    )

    by_metric = AgentEvaluator._get_eval_metric_results_with_invocation(
        [case_result]
    )

    # The grouped entry must carry the None through, not raise or drop it.
    assert "m1" in by_metric
    assert len(by_metric["m1"]) == 1
    entry = by_metric["m1"][0]
    assert entry.expected_invocation is None
    assert entry.actual_invocation is actual_invocation
137+
138+
139+
# ---------------------------------------------------------------------------
140+
# Tests: _print_details does not crash when expected_invocation is None
141+
# ---------------------------------------------------------------------------
142+
143+
class TestPrintDetailsNoneExpected:
  """_print_details must handle None expected_invocation gracefully."""

  def test_print_details_with_none_expected(self):
    results = [
        _EvalMetricResultWithInvocation(
            actual_invocation=_make_invocation(),
            expected_invocation=None,
            eval_metric_result=_make_eval_metric_result(score=0.9),
        )
    ]

    # _print_details prints to stdout via tabulate/pandas — we just
    # verify it doesn't raise.
    with patch("builtins.print"):
      AgentEvaluator._print_details(
          eval_metric_result_with_invocations=results,
          overall_eval_status=EvalStatus.PASSED,
          overall_score=0.9,
          metric_name="test_metric",
          threshold=0.5,
      )

0 commit comments

Comments
 (0)