From cd3d9e29006dc07e9810f9b937101d550108fee2 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 5 May 2026 07:12:05 +0000 Subject: [PATCH 1/3] feat: add ci:run-evals label support and parameterize evaluation reporting project for flexible CI execution --- .github/labels.yaml | 6 +++++- cloudbuild.yaml | 30 ++++++++++++++++++------------ evals/model_config.yaml | 3 ++- evals/run_config.yaml | 3 ++- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/.github/labels.yaml b/.github/labels.yaml index 5974f23..abf2d71 100644 --- a/.github/labels.yaml +++ b/.github/labels.yaml @@ -83,4 +83,8 @@ - name: 'release-please:force-run' color: bdca82 - description: Manually trigger the release please workflow on a PR. \ No newline at end of file + description: Manually trigger the release please workflow on a PR. + +- name: 'ci:run-evals' + color: 4285f4 + description: Manually trigger the evaluation CI pipeline on a PR. \ No newline at end of file diff --git a/cloudbuild.yaml b/cloudbuild.yaml index d871aaa..921717b 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -27,12 +27,7 @@ steps: - | set -e - # Only run on release branches - if [[ "$_HEAD_BRANCH" != release-please-* ]]; then - echo "Not a release-please branch. Exiting." - exit 0 - fi - echo "Release branch detected. Fetching PR data from GitHub API..." + echo "Fetching PR data from GitHub API..." # Fetch PR data and status code HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \ @@ -46,28 +41,39 @@ steps: PR_DATA=$(cat pr_data.json) - # Extract labels and title from PR data (Use $$ to escape bash variables) - PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")') + # Extract title from PR data (Use $$ to escape bash variables) PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title') - # Determine Release Version (Use double quotes and $$ for bash variables) - if [[ "$$PR_LABELS" == *"autorelease: triggered"* ]]; then + # Check if execution labels are present using exact matching via jq + if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then + echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution." + exit 0 + fi + echo "Execution label detected. Processing release version context..." + + # Determine Release Version based on branch name + if [[ "$_HEAD_BRANCH" == release-please-* ]]; then if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then export RELEASE_VERSION="$${BASH_REMATCH[1]}" else - export RELEASE_VERSION="unknown" + export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown" fi else - export RELEASE_VERSION="unknown" + export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals" fi # Workaround for evalbench bug: settings are only applied if path basename matches extension ID ln -s /workspace /workspace/cloud-sql-postgresql cd /evalbench + # Set environment variables for evalbench export EVAL_GCP_PROJECT_ID=$PROJECT_ID export EVAL_GCP_PROJECT_REGION=$_CLOUD_SQL_REGION export GOOGLE_CLOUD_PROJECT=$PROJECT_ID + export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT + + + # Set environment variables for extension export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE export CLOUD_SQL_POSTGRES_REGION=$_CLOUD_SQL_REGION diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 485c758..2973cb4 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -gemini_cli_version: "@google/gemini-cli@0.38.1" +gemini_cli_version: "@google/gemini-cli@latest" generator: gemini_cli env: GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" GOOGLE_CLOUD_LOCATION: "global" GOOGLE_GENAI_USE_VERTEXAI: "true" + GEMINI_CLI_TRUST_WORKSPACE: "true" setup: extensions: # Points to the symlink created in cloudbuild.yaml to match the extension ID diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 0f45e6e..4707ac7 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -34,4 +34,5 @@ scorers: reporting: bigquery: - gcp_project_id: cloud-db-nl2sql + gcp_project_id: "${EVAL_REPORTING_PROJECT}" + From 28db373a9472529fca883555b4e0bf21dc6f04a8 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 5 May 2026 07:15:12 +0000 Subject: [PATCH 2/3] ci: add skills_best_practices configuration and skills_trajectory metric to run_config --- evals/run_config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 4707ac7..600bddd 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -25,12 +25,16 @@ scorers: model_config: /workspace/evals/gemini_2.5_pro_model.yaml behavioral_metrics: model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_best_practices: + model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_dir: /workspace/cloud-sql-postgresql/skills # Performance turn_count: {} end_to_end_latency: {} tool_call_latency: {} token_consumption: {} + skills_trajectory: {} reporting: bigquery: From 5f9931a063c46661fa711c0918a780905ee655f8 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 5 May 2026 07:27:16 +0000 Subject: [PATCH 3/3] ci: parameterize project and instance names in dataset and add to substitution script --- evals/dataset.json | 16 ++++++++-------- evals/substitute_env.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/evals/dataset.json b/evals/dataset.json index a42bbae..654015f 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -2,14 +2,14 @@ "scenarios": [ { "id": "cloud-sql-debug-instance", - "starting_prompt": "Check on my databases in project ext-test-cloud-sql-postgres.", - "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if 'daily-ci-evals-db' exists, get its details and validate it is RUNNABLE.", + "starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.", + "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.", "expected_trajectory": [ "list_instances", "get_instance" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 3 @@ -23,7 +23,7 @@ "list_tables" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 3 @@ -37,21 +37,21 @@ "list_locks" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 3 }, { "id": "cloud-sql-metrics-cpu-investigation", - "starting_prompt": "I'm worried about the database load for daily-ci-evals-db.", - "conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.", + "starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.", + "conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.", "expected_trajectory": [ "get_system_metrics", "list_database_stats" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 3 diff --git a/evals/substitute_env.py b/evals/substitute_env.py index f10c8e3..cbe1a3a 100644 --- a/evals/substitute_env.py +++ b/evals/substitute_env.py @@ -2,7 +2,7 @@ import re def main(): - yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml'] + yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json'] for yaml_path in yaml_paths: if os.path.exists(yaml_path): with open(yaml_path, 'r') as f: