# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import abc
from enum import Enum
from typing import Optional
from typing import Union

from google.genai import types as genai_types
from pydantic import alias_generators
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from pydantic import field_validator
from pydantic.json_schema import SkipJsonSchema
from typing_extensions import TypeAlias

from .common import EvalBaseModel
from .eval_case import Invocation
from .eval_rubrics import Rubric
from .eval_rubrics import RubricScore


class EvalStatus(Enum):
  PASSED = 1
  FAILED = 2
  NOT_EVALUATED = 3


class PrebuiltMetrics(Enum):
  TOOL_TRAJECTORY_AVG_SCORE = "tool_trajectory_avg_score"
  RESPONSE_EVALUATION_SCORE = "response_evaluation_score"
  RESPONSE_MATCH_SCORE = "response_match_score"
  SAFETY_V1 = "safety_v1"
  FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
  RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1 = (
      "rubric_based_final_response_quality_v1"
  )
  HALLUCINATIONS_V1 = "hallucinations_v1"
  RUBRIC_BASED_TOOL_USE_QUALITY_V1 = "rubric_based_tool_use_quality_v1"
  PER_TURN_USER_SIMULATOR_QUALITY_V1 = "per_turn_user_simulator_quality_v1"
  MULTI_TURN_TASK_SUCCESS_V1 = "multi_turn_task_success_v1"
  MULTI_TURN_TRAJECTORY_QUALITY_V1 = "multi_turn_trajectory_quality_v1"
  MULTI_TURN_TOOL_USE_QUALITY_V1 = "multi_turn_tool_use_quality_v1"


MetricName: TypeAlias = Union[str, PrebuiltMetrics]
Threshold: TypeAlias = float


class JudgeModelOptions(EvalBaseModel):
  """Options for an eval metric's judge model."""

  judge_model: str = Field(
      default="gemini-2.5-flash",
      description=(
          "The judge model to use for evaluation. It can be a model name."
      ),
  )

  judge_model_config: SkipJsonSchema[
      Optional[genai_types.GenerateContentConfig]
  ] = Field(
      default=None,
      description="The configuration for the judge model.",
  )

  num_samples: int = Field(
      default=5,
      description=(
          "The number of times to sample the judge model for each invocation"
          " being evaluated. Given that models tend to have a certain degree"
          " of unreliability, we repeatedly sample them with the same data,"
          " and the repeated samples are then aggregated using some strategy."
          " From experimentation, we have found 5 to be a good default."
      ),
  )
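
# A minimal usage sketch (the values below are illustrative, not recommended
# settings): lowering `num_samples` trades evaluation stability for speed and
# cost.
#
#   quick_judge = JudgeModelOptions(
#       judge_model="gemini-2.5-flash",
#       num_samples=3,
#   )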


class BaseCriterion(BaseModel):
  """Base criterion to use for an Eval Metric."""

  model_config = ConfigDict(
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
      extra="allow",
  )

  threshold: Threshold = Field(
      description="The threshold to be used by the metric.",
  )


class LlmAsAJudgeCriterion(BaseCriterion):
  """Criterion when using an LLM-As-A-Judge metric."""

  judge_model_options: JudgeModelOptions = Field(
      default_factory=JudgeModelOptions,
      description="Options for the judge model.",
  )


class RubricsBasedCriterion(BaseCriterion):
  """Criterion when using a rubric based metric."""

  judge_model_options: JudgeModelOptions = Field(
      default_factory=JudgeModelOptions,
      description="Options for the judge model.",
  )

  rubrics: list[Rubric] = Field(
      default_factory=list,
      description=(
          "Rubrics to be used by the metric. Not all metrics rely on rubrics,"
          " but metrics like `rubric_based_final_response_quality_v1` do."
          " Metrics that don't use rubrics simply ignore this field if it is"
          " specified. Metrics that do use rubrics will raise an exception if"
          " they are not specified."
      ),
  )

  evaluate_full_response: bool = Field(
      default=False,
      description=(
          "Whether to evaluate the full agent response, including intermediate"
          " natural language text (e.g. text emitted before tool calls), in"
          " addition to the final response. By default, only the final"
          " response text is sent to the judge. When True, text from all"
          " intermediate invocation events is concatenated with the final"
          " response before evaluation. This is useful for agents that emit"
          " text both before and after tool calls within a single invocation."
      ),
  )


class HallucinationsCriterion(BaseCriterion):
  """Criterion to use when evaluating an agent's response for hallucinations."""

  judge_model_options: JudgeModelOptions = Field(
      default_factory=JudgeModelOptions,
      description="Options for the judge model.",
  )

  evaluate_intermediate_nl_responses: bool = Field(
      default=False,
      description=(
          "Whether intermediate natural language responses should also be"
          " evaluated for hallucinations. By default, the metric only"
          " evaluates the agent's final response for hallucinations."
      ),
  )


class ToolTrajectoryCriterion(BaseCriterion):
  """Criterion to use when comparing an agent's tool trajectory with a reference one."""

  class MatchType(Enum):
    """The type of match between actual and expected tool call trajectories."""

    EXACT = 0
    """Requires a perfect match between the actual and expected tool calls."""

    IN_ORDER = 1
    """Requires the actual tool calls to contain the expected tool calls in the
    same order, with allowance for extra tool calls to have happened.

    This criterion is useful for asserting that certain key actions/tool calls
    occur, and in a certain order, while leaving room for other tool calls to
    happen as well.

    Example 1: Set of actual vs expected tool calls that satisfies the
    criterion:
      Expected tool calls: [T1, T2, T3]
      Actual tool calls: [T1, T1.1, T2, T2.1, T2.2, T3, T3.1]

      This satisfies the criterion, as the tools T1, T2 and T3 happened in the
      "Actual" and in the same order.

    Example 2: Set of actual vs expected tool calls that does not satisfy the
    criterion:
      Expected tool calls: [T1, T2, T3, T4]
      Actual tool calls: [T1, T1.1, T2, T2.1, T2.2, T3, T3.1]

      While the tool calls T1, T2 and T3 happened in the "Actual" and in the
      same order as "Expected", the tool call T4 is missing.
    """

    ANY_ORDER = 2
    """Requires the actual tool calls to contain the expected tool calls in any
    order, with allowance for extra tool calls to have happened.

    This criterion is helpful for cases where multiple tool calls about the
    same concept occur, for example when your agent issues 5 search queries.
    You don't really care about the order in which the search queries are
    issued, as long as they all occur.

    Example 1: Set of actual vs expected tool calls that satisfies the
    criterion:
      Expected tool calls: [T1, T2, T3]
      Actual tool calls: [T2, T2.1, T1, T1.1, T1.2, T3, T3.1]

      This satisfies the criterion, as the tools T1, T2 and T3 happened in the
      "Actual" and are all present in "Expected". Note that the order is
      different.

    Example 2: Set of actual vs expected tool calls that does not satisfy the
    criterion:
      Expected tool calls: [T1, T2, T3, T4]
      Actual tool calls: [T1, T1.1, T2, T2.1, T2.2, T3, T3.1]

      While the tool calls T1, T2 and T3 happened in the "Actual", the tool
      call T4 is missing.
    """

  match_type: MatchType = Field(
      default=MatchType.EXACT,
      description=(
          "The type of match between actual and expected tool call"
          " trajectories."
      ),
  )

  @field_validator("match_type", mode="before")
  @classmethod
  def _coerce_match_type(cls, value: object) -> object:
    """Coerces string values (e.g. in_order, IN-ORDER) to a MatchType member."""
    if isinstance(value, cls.MatchType):
      return value
    if isinstance(value, str):
      normalized = value.strip().upper().replace("-", "_").replace(" ", "_")
      if normalized in cls.MatchType.__members__:
        return cls.MatchType[normalized]
    return value
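
# A minimal usage sketch (illustrative only): `match_type` accepts either a
# MatchType member or, thanks to the validator above, a string such as
# "in_order" / "IN-ORDER", which is normalized and coerced to the enum.
#
#   criterion = ToolTrajectoryCriterion(threshold=1.0, match_type="in_order")
#   assert criterion.match_type == ToolTrajectoryCriterion.MatchType.IN_ORDER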


class LlmBackedUserSimulatorCriterion(LlmAsAJudgeCriterion):
  """Criterion for LLM-backed User Simulator Evaluators."""

  stop_signal: str = Field(
      default="</finished>",
      description=(
          "Stop signal to validate the successful completion of a"
          " conversation. For optimal performance, this should match the one"
          " in the User Simulator."
      ),
  )


class EvalMetric(EvalBaseModel):
  """A metric used to evaluate a particular aspect of an eval case."""

  metric_name: str = Field(
      description="The name of the metric.",
  )

  threshold: Optional[float] = Field(
      default=None,
      description=(
          "This field will be deprecated soon. Please use `criterion` instead."
          " A threshold value. Each metric decides how to interpret this"
          " threshold."
      ),
  )

  criterion: Optional[BaseCriterion] = Field(
      default=None,
      description="Evaluation criterion used by the metric.",
  )

  custom_function_path: Optional[str] = Field(
      default=None,
      description="Path to custom function, if this is a custom metric.",
  )
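
# A minimal usage sketch (illustrative values; whether a given metric accepts a
# particular criterion type is decided by that metric's implementation, not
# enforced here): pairing a prebuilt metric name with a criterion defined
# above.
#
#   tool_trajectory_metric = EvalMetric(
#       metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
#       criterion=ToolTrajectoryCriterion(
#           threshold=1.0,
#           match_type=ToolTrajectoryCriterion.MatchType.IN_ORDER,
#       ),
#   )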


class EvalMetricResultDetails(EvalBaseModel):
  """Granular details accompanying an eval metric result."""

  rubric_scores: Optional[list[RubricScore]] = Field(
      default=None,
      description=(
          "The scores obtained after applying the rubrics to the Agent's"
          " response."
      ),
  )


class EvalMetricResult(EvalMetric):
  """The actual computed score/value of a particular EvalMetric."""

  score: Optional[float] = Field(
      default=None,
      description=(
          "Score obtained after evaluating the metric. Optional, as evaluation"
          " might not have happened."
      ),
  )

  eval_status: EvalStatus = Field(description="The status of this evaluation.")

  details: EvalMetricResultDetails = Field(
      default_factory=EvalMetricResultDetails,
      description="Details of the metric evaluation result.",
  )


class EvalMetricResultPerInvocation(EvalBaseModel):
  """Eval metric results per invocation."""

  actual_invocation: Invocation = Field(
      description=(
          "The actual invocation, usually obtained by inferencing the agent."
      )
  )

  expected_invocation: Optional[Invocation] = Field(
      default=None,
      description=(
          "The expected invocation, usually the reference or golden invocation."
      ),
  )

  eval_metric_results: list[EvalMetricResult] = Field(
      default=[],
      description="Eval results for each applicable metric.",
  )


class Interval(EvalBaseModel):
  """Represents a range of numeric values, e.g. [0, 1] or (2, 3) or [-1, 6)."""

  min_value: float = Field(description="The smaller end of the interval.")

  open_at_min: bool = Field(
      default=False,
      description=(
          "Whether the interval is open at the min end. The default value is"
          " False, which means the interval is assumed to be closed."
      ),
  )

  max_value: float = Field(description="The larger end of the interval.")

  open_at_max: bool = Field(
      default=False,
      description=(
          "Whether the interval is open at the max end. The default value is"
          " False, which means the interval is assumed to be closed."
      ),
  )


class MetricValueInfo(EvalBaseModel):
  """Information about the type of metric value."""

  interval: Optional[Interval] = Field(
      default=None,
      description="The values represented by the metric are of type interval.",
  )


class MetricInfo(EvalBaseModel):
  """Information about a metric that is used for Evals."""

  metric_name: str = Field(description="The name of the metric.")

  description: Optional[str] = Field(
      default=None, description="A 2 to 3 line description of the metric."
  )

  metric_value_info: MetricValueInfo = Field(
      description="Information on the nature of values supported by the metric."
  )


class MetricInfoProvider(abc.ABC):
  """Interface for providing MetricInfo."""

  @abc.abstractmethod
  def get_metric_info(self) -> MetricInfo:
    """Returns MetricInfo for a given metric."""
    raise NotImplementedError
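

# A minimal implementation sketch (illustrative only; the metric name,
# description and interval bounds below are assumptions, not values defined by
# this module): a provider describing a metric whose scores fall in the closed
# interval [0.0, 1.0].
#
#   class ExampleMetricInfoProvider(MetricInfoProvider):
#
#     def get_metric_info(self) -> MetricInfo:
#       return MetricInfo(
#           metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
#           description="Illustrative metric whose scores fall in [0, 1].",
#           metric_value_info=MetricValueInfo(
#               interval=Interval(min_value=0.0, max_value=1.0)
#           ),
#       )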