Skip to content

Commit de15ea9

Browse files
jsondai and copybara-github
authored and committed
chore: GenAI Client(evals) - Improve retry budget, add jitter, and expand retryable errors
PiperOrigin-RevId: 897754825
1 parent 7142c62 commit de15ea9

3 files changed

Lines changed: 245 additions & 103 deletions

File tree

tests/unit/vertexai/genai/test_evals.py

Lines changed: 135 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6668,6 +6668,8 @@ def test_predefined_metric_retry_fail_on_resource_exhausted(
66686668
genai_errors.ClientError(code=429, response_json=error_response_json),
66696669
genai_errors.ClientError(code=429, response_json=error_response_json),
66706670
genai_errors.ClientError(code=429, response_json=error_response_json),
6671+
genai_errors.ClientError(code=429, response_json=error_response_json),
6672+
genai_errors.ClientError(code=429, response_json=error_response_json),
66716673
]
66726674

66736675
result = _evals_common._execute_evaluation(
@@ -6676,18 +6678,13 @@ def test_predefined_metric_retry_fail_on_resource_exhausted(
66766678
metrics=[metric],
66776679
)
66786680

6679-
assert mock_private_evaluate_instances.call_count == 3
6680-
assert mock_sleep.call_count == 2
6681+
assert mock_private_evaluate_instances.call_count == 5
6682+
assert mock_sleep.call_count == 4
66816683
assert len(result.summary_metrics) == 1
66826684
summary_metric = result.summary_metrics[0]
66836685
assert summary_metric.metric_name == "summarization_quality"
66846686
assert summary_metric.mean_score is None
66856687
assert summary_metric.num_cases_error == 1
6686-
assert (
6687-
"Judge model resource exhausted after 3 retries"
6688-
) in result.eval_case_results[0].response_candidate_results[0].metric_results[
6689-
"summarization_quality"
6690-
].error_message
66916688

66926689

66936690
class TestEvaluationDataset:
@@ -7258,3 +7255,134 @@ def test_rate_limiter_no_sleep_when_enough_time_passed(self):
72587255
elapsed = real_time.time() - start
72597256
# 5 calls at 1000 QPS should take ~0.005s, certainly under 1s
72607257
assert elapsed < 1.0
7258+
7259+
7260+
class TestCallWithRetry:
    """Tests for the shared _call_with_retry helper.

    Every test patches ``time.sleep`` so that any wait between attempts
    returns immediately; the patched mock's call count is then used to check
    how many waits were requested.
    """

    @mock.patch("time.sleep", return_value=None)
    def test_call_with_retry_success_on_first_try(self, mock_sleep):
        """Tests that _call_with_retry returns immediately on success."""
        fn = mock.Mock(return_value="success")
        result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
        assert result == "success"
        assert fn.call_count == 1
        assert mock_sleep.call_count == 0

    @mock.patch("time.sleep", return_value=None)
    def test_call_with_retry_success_after_retries(self, mock_sleep):
        """Tests that _call_with_retry succeeds after transient failures."""
        error_json = {"error": {"code": 429, "message": "exhausted"}}
        fn = mock.Mock(
            side_effect=[
                genai_errors.ClientError(code=429, response_json=error_json),
                genai_errors.ClientError(code=429, response_json=error_json),
                "success",
            ]
        )
        result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
        assert result == "success"
        # Two failed attempts, each followed by one wait, then the success.
        assert fn.call_count == 3
        assert mock_sleep.call_count == 2

    @mock.patch("time.sleep", return_value=None)
    def test_call_with_retry_raises_after_max_retries(self, mock_sleep):
        """Tests that _call_with_retry raises after exhausting retries."""
        error_json = {"error": {"code": 429, "message": "exhausted"}}
        fn = mock.Mock(
            side_effect=genai_errors.ClientError(code=429, response_json=error_json)
        )
        with pytest.raises(genai_errors.ClientError):
            _evals_metric_handlers._call_with_retry(fn, "test_metric")
        # These checks must sit outside the ``with`` block: anything placed
        # after the raising call inside it would never execute.
        assert fn.call_count == 5  # _MAX_RETRIES
        assert mock_sleep.call_count == 4

    @mock.patch("time.sleep", return_value=None)
    def test_call_with_retry_retries_on_server_error(self, mock_sleep):
        """Tests retry on 503 ServiceUnavailable (ServerError)."""
        error_json = {"error": {"code": 503, "message": "unavailable"}}
        fn = mock.Mock(
            side_effect=[
                genai_errors.ServerError(code=503, response_json=error_json),
                "success",
            ]
        )
        result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
        assert result == "success"
        assert fn.call_count == 2
        # One failed attempt should be followed by exactly one wait, matching
        # the sleep accounting asserted by the 429 tests in this class.
        assert mock_sleep.call_count == 1

    @mock.patch("time.sleep", return_value=None)
    def test_call_with_retry_no_retry_on_non_retryable(self, mock_sleep):
        """Tests that non-retryable errors are raised immediately."""
        error_json = {"error": {"code": 400, "message": "bad request"}}
        fn = mock.Mock(
            side_effect=genai_errors.ClientError(code=400, response_json=error_json)
        )
        with pytest.raises(genai_errors.ClientError):
            _evals_metric_handlers._call_with_retry(fn, "test_metric")
        # A 400 must surface after a single attempt with no wait at all.
        assert fn.call_count == 1
        assert mock_sleep.call_count == 0
7325+
7326+
7327+
class TestComputationMetricRetry:
    """Retry coverage for ComputationMetricHandler."""

    @mock.patch.object(
        _evals_metric_handlers.ComputationMetricHandler,
        "SUPPORTED_COMPUTATION_METRICS",
        frozenset(["bleu"]),
    )
    @mock.patch("time.sleep", return_value=None)
    # fmt: off
    @mock.patch(
        "vertexai._genai.evals.Evals.evaluate_instances"
    )
    # fmt: on
    def test_computation_metric_retry_on_resource_exhausted(
        self,
        mock_evaluate_instances,
        mock_sleep,
        mock_api_client_fixture,
    ):
        """Checks that a bleu evaluation recovers after two 429 responses."""
        single_row = {
            "prompt": "Test prompt",
            "response": "Test response",
            "reference": "Test reference",
        }
        eval_dataset = vertexai_genai_types.EvaluationDataset(
            eval_dataset_df=pd.DataFrame([single_row])
        )
        bleu_metric = vertexai_genai_types.Metric(name="bleu")

        quota_error_payload = {
            "error": {
                "code": 429,
                "message": "Resource exhausted.",
                "status": "RESOURCE_EXHAUSTED",
            }
        }
        bleu_response = mock.MagicMock()
        bleu_response.model_dump.return_value = {
            "bleu_results": {"bleu_metric_values": [{"score": 0.85}]}
        }
        # Two quota failures in a row, then a usable BLEU payload.
        mock_evaluate_instances.side_effect = [
            *(
                genai_errors.ClientError(code=429, response_json=quota_error_payload)
                for _ in range(2)
            ),
            bleu_response,
        ]

        result = _evals_common._execute_evaluation(
            api_client=mock_api_client_fixture,
            dataset=eval_dataset,
            metrics=[bleu_metric],
        )

        # Three calls in total: two rejected, one accepted; one sleep per
        # rejected call.
        assert mock_evaluate_instances.call_count == 3
        assert mock_sleep.call_count == 2
        bleu_summary = result.summary_metrics[0]
        assert bleu_summary.metric_name == "bleu"
        assert bleu_summary.mean_score == 0.85

0 commit comments

Comments
 (0)