Skip to content

Commit 69c0c82

Browse files
ci: add ci:run-evals label support and missing scorers, and parameterize the evaluation reporting project for flexible CI execution (#168)
1 parent d3d6339 commit 69c0c82

6 files changed

Lines changed: 40 additions & 24 deletions

File tree

.github/labels.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,8 @@
8383

8484
- name: 'release-please:force-run'
8585
color: bdca82
86-
description: Manually trigger the release please workflow on a PR.
86+
description: Manually trigger the release please workflow on a PR.
87+
88+
- name: 'ci:run-evals'
89+
color: 4285f4
90+
description: Manually trigger the evaluation CI pipeline on a PR.

cloudbuild.yaml

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,7 @@ steps:
2727
- |
2828
set -e
2929
30-
# Only run on release branches
31-
if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
32-
echo "Not a release-please branch. Exiting."
33-
exit 0
34-
fi
35-
echo "Release branch detected. Fetching PR data from GitHub API..."
30+
echo "Fetching PR data from GitHub API..."
3631
3732
# Fetch PR data and status code
3833
HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
@@ -46,28 +41,39 @@ steps:
4641
4742
PR_DATA=$(cat pr_data.json)
4843
49-
# Extract labels and title from PR data (Use $$ to escape bash variables)
50-
PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
44+
# Extract title from PR data (use $$ so Cloud Build passes a literal $ through to bash instead of treating it as a substitution)
5145
PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
5246
53-
# Determine Release Version (Use double quotes and $$ for bash variables)
54-
if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then
47+
# Check if execution labels are present using exact matching via jq
48+
if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then
49+
echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
50+
exit 0
51+
fi
52+
echo "Execution label detected. Processing release version context..."
53+
54+
# Determine Release Version: parsed from the PR title on release-please branches, otherwise a PR-numbered placeholder
55+
if [[ "$_HEAD_BRANCH" == release-please-* ]]; then
5556
if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
5657
export RELEASE_VERSION="$${BASH_REMATCH[1]}"
5758
else
58-
export RELEASE_VERSION="unknown"
59+
export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown"
5960
fi
6061
else
61-
export RELEASE_VERSION="unknown"
62+
export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals"
6263
fi
6364
6465
# Workaround for evalbench bug: settings are only applied if path basename matches extension ID
6566
ln -s /workspace /workspace/cloud-sql-postgresql
6667
cd /evalbench
6768
69+
# Set environment variables for evalbench
6870
export EVAL_GCP_PROJECT_ID=$PROJECT_ID
6971
export EVAL_GCP_PROJECT_REGION=$_CLOUD_SQL_REGION
7072
export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
73+
export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
74+
75+
76+
# Set environment variables for extension
7177
export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
7278
export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
7379
export CLOUD_SQL_POSTGRES_REGION=$_CLOUD_SQL_REGION

evals/dataset.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
"scenarios": [
33
{
44
"id": "cloud-sql-debug-instance",
5-
"starting_prompt": "Check on my databases in project ext-test-cloud-sql-postgres.",
6-
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if 'daily-ci-evals-db' exists, get its details and validate it is RUNNABLE.",
5+
"starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.",
6+
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.",
77
"expected_trajectory": [
88
"list_instances",
99
"get_instance"
1010
],
1111
"env": {
12-
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
12+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
1313
},
1414
"kind": "tools",
1515
"max_turns": 3
@@ -23,7 +23,7 @@
2323
"list_tables"
2424
],
2525
"env": {
26-
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
26+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
2727
},
2828
"kind": "tools",
2929
"max_turns": 3
@@ -37,21 +37,21 @@
3737
"list_locks"
3838
],
3939
"env": {
40-
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
40+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
4141
},
4242
"kind": "tools",
4343
"max_turns": 3
4444
},
4545
{
4646
"id": "cloud-sql-metrics-cpu-investigation",
47-
"starting_prompt": "I'm worried about the database load for daily-ci-evals-db.",
48-
"conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
47+
"starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.",
48+
"conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
4949
"expected_trajectory": [
5050
"get_system_metrics",
5151
"list_database_stats"
5252
],
5353
"env": {
54-
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
54+
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
5555
},
5656
"kind": "tools",
5757
"max_turns": 3

evals/model_config.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
gemini_cli_version: "@google/gemini-cli@0.38.1"
15+
gemini_cli_version: "@google/gemini-cli@latest"
1616
generator: gemini_cli
1717
env:
1818
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
1919
GOOGLE_CLOUD_LOCATION: "global"
2020
GOOGLE_GENAI_USE_VERTEXAI: "true"
21+
GEMINI_CLI_TRUST_WORKSPACE: "true"
2122
setup:
2223
extensions:
2324
# Points to the symlink created in cloudbuild.yaml to match the extension ID

evals/run_config.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,18 @@ scorers:
2525
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
2626
behavioral_metrics:
2727
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
28+
skills_best_practices:
29+
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
30+
skills_dir: /workspace/cloud-sql-postgresql/skills
2831

2932
# Performance
3033
turn_count: {}
3134
end_to_end_latency: {}
3235
tool_call_latency: {}
3336
token_consumption: {}
37+
skills_trajectory: {}
3438

3539
reporting:
3640
bigquery:
37-
gcp_project_id: cloud-db-nl2sql
41+
gcp_project_id: "${EVAL_REPORTING_PROJECT}"
42+

evals/substitute_env.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import re
33

44
def main():
5-
yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml']
5+
yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json']
66
for yaml_path in yaml_paths:
77
if os.path.exists(yaml_path):
88
with open(yaml_path, 'r') as f:

0 commit comments

Comments
 (0)