From 04be7522382a9ac8f4142914cf762d81e5d762a6 Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 4 May 2026 10:35:21 +0000 Subject: [PATCH 1/3] ci: add automated evaluation pipeline with environment variable substitution for Cloud SQL SQL Server --- cloudbuild.yaml | 106 ++++++++++++++++++++++++++++++++ evals/ci_metadata.yaml | 22 +++++++ evals/dataset.json | 45 ++++++++++++++ evals/gemini_2.5_pro_model.yaml | 18 ++++++ evals/model_config.yaml | 32 ++++++++++ evals/run_config.yaml | 41 ++++++++++++ evals/substitute_env.py | 18 ++++++ 7 files changed, 282 insertions(+) create mode 100644 cloudbuild.yaml create mode 100644 evals/ci_metadata.yaml create mode 100644 evals/dataset.json create mode 100644 evals/gemini_2.5_pro_model.yaml create mode 100644 evals/model_config.yaml create mode 100644 evals/run_config.yaml create mode 100644 evals/substitute_env.py diff --git a/cloudbuild.yaml b/cloudbuild.yaml new file mode 100644 index 0000000..3e620cd --- /dev/null +++ b/cloudbuild.yaml @@ -0,0 +1,106 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+options:
+  logging: CLOUD_LOGGING_ONLY
+
+steps:
+
+  # --- Evaluation Step ---
+  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
+    entrypoint: 'bash'
+    # Exposes the Secret Manager secrets declared under availableSecrets as the MSSQL_DB_PASSWORD and GITHUB_TOKEN environment variables
+    secretEnv: ['MSSQL_DB_PASSWORD', 'GITHUB_TOKEN']
+    args:
+      - '-c'
+      - |
+        set -e
+
+        echo "Fetching PR data from GitHub API..."
+
+        # Fetch PR data and status code
+        HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
+          "https://api.github.com/repos/$REPO_FULL_NAME/pulls/$_PR_NUMBER")
+
+        if [ "$$HTTP_STATUS" -ne 200 ]; then
+          echo "Error fetching PR data: HTTP $$HTTP_STATUS"
+          cat pr_data.json
+          exit 1
+        fi
+
+        PR_DATA=$(cat pr_data.json)
+
+        # Extract labels and title from PR data ($$ reaches bash as a literal $, so Cloud Build does not treat these names as substitutions)
+        PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
+        PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
+
+        # Check if execution labels are present
+        if [[ "$$PR_LABELS" != *"autorelease: pending"* && "$$PR_LABELS" != *"ci:run-evals"* ]]; then
+          echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
+          exit 0
+        fi
+        echo "Execution label detected. Processing release version context..."
+
+        # Determine Release Version based on branch name
+        if [[ "$_HEAD_BRANCH" == release-please-* ]]; then
+          if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
+            export RELEASE_VERSION="$${BASH_REMATCH[1]}"
+          else
+            export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown"
+          fi
+        else
+          export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals"
+        fi
+
+        # Workaround for evalbench bug: settings are only applied if path basename matches extension ID
+        ln -s /workspace /workspace/cloud-sql-sqlserver
+        cd /evalbench
+
+        # evalbench specific environment variables
+        export EVAL_GCP_PROJECT_ID=$PROJECT_ID
+        export EVAL_GCP_PROJECT_REGION=$_CLOUD_SQL_REGION
+        export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
+        export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
+
+        # Cloud SQL SQL Server specific environment variables
+        export CLOUD_SQL_MSSQL_PROJECT=$PROJECT_ID
+        export CLOUD_SQL_MSSQL_INSTANCE=$_CLOUD_SQL_INSTANCE
+        export CLOUD_SQL_MSSQL_REGION=$_CLOUD_SQL_REGION
+        export CLOUD_SQL_MSSQL_DATABASE=$_CLOUD_SQL_DATABASE
+        export CLOUD_SQL_MSSQL_USER=$_CLOUD_SQL_USER
+        export CLOUD_SQL_MSSQL_IP_TYPE=$_CLOUD_SQL_IP_TYPE
+
+        # Maps the decrypted MSSQL_DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
+        export CLOUD_SQL_MSSQL_PASSWORD=$$MSSQL_DB_PASSWORD
+
+        # Combine CI metadata with run config
+        cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
+
+        # Substitute environment variables in model_config.yaml, run_config.yaml and dataset.json
+        python3 /workspace/evals/substitute_env.py
+
+        cd /evalbench
+        export PYTHONPATH=./evalbench:./evalbench/evalproto
+        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+        echo "Launching Standalone Evaluation..."
+ python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml + + +availableSecrets: + secretManager: + - versionName: projects/$PROJECT_ID/secrets/MSSQL_DB_PASSWORD/versions/latest + env: 'MSSQL_DB_PASSWORD' + - versionName: projects/$PROJECT_ID/secrets/GITHUB_TOKEN/versions/latest + env: 'GITHUB_TOKEN' diff --git a/evals/ci_metadata.yaml b/evals/ci_metadata.yaml new file mode 100644 index 0000000..d72492f --- /dev/null +++ b/evals/ci_metadata.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################ +### CI Metadata (Repository Specific) +### Note: These fields are used for version tracking in BQ +### and are not part of the core Evalbench schema. +############################################################ + +extension_id: cloud-sql-sqlserver +release_version: ${RELEASE_VERSION} diff --git a/evals/dataset.json b/evals/dataset.json new file mode 100644 index 0000000..860a32b --- /dev/null +++ b/evals/dataset.json @@ -0,0 +1,45 @@ +{ + "scenarios": [ + { + "id": "cloud-sql-debug-instance", + "starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.", + "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. 
Once all instances are listed, if '${CLOUD_SQL_MSSQL_INSTANCE}' exists, get its details and validate it is RUNNABLE.", + "expected_trajectory": [ + "list_instances", + "get_instance" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "cloud-sql-schema-tables-explore", + "starting_prompt": "I want to understand the structure of my database.", + "conversation_plan": "First, ask the agent to list the databases in the instance. After the agent provides the databases, ask it to list the tables specifically for that database.", + "expected_trajectory": [ + "list_databases", + "list_tables" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "cloud-sql-performance-check", + "starting_prompt": "Our database performance seems degraded.", + "conversation_plan": "Start by asking the agent to check the CPU utilization system metrics for the database instance to see if it's overloaded.", + "expected_trajectory": [ + "get_system_metrics" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" + }, + "kind": "tools", + "max_turns": 3 + } + ] +} diff --git a/evals/gemini_2.5_pro_model.yaml b/evals/gemini_2.5_pro_model.yaml new file mode 100644 index 0000000..7154ec3 --- /dev/null +++ b/evals/gemini_2.5_pro_model.yaml @@ -0,0 +1,18 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +generator: gcp_vertex_gemini +vertex_model: gemini-2.5-pro +base_prompt: "" +execs_per_minute: 5 diff --git a/evals/model_config.yaml b/evals/model_config.yaml new file mode 100644 index 0000000..71ece22 --- /dev/null +++ b/evals/model_config.yaml @@ -0,0 +1,32 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +gemini_cli_version: "@google/gemini-cli@latest" +generator: gemini_cli +env: + GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" + GOOGLE_CLOUD_LOCATION: "global" + GOOGLE_GENAI_USE_VERTEXAI: "true" +setup: + extensions: + # Points to the symlink created in cloudbuild.yaml to match the extension ID + "/workspace/cloud-sql-sqlserver": + settings: + CLOUD_SQL_MSSQL_PROJECT: "${CLOUD_SQL_MSSQL_PROJECT}" + CLOUD_SQL_MSSQL_INSTANCE: "${CLOUD_SQL_MSSQL_INSTANCE}" + CLOUD_SQL_MSSQL_REGION: "${CLOUD_SQL_MSSQL_REGION}" + CLOUD_SQL_MSSQL_DATABASE: "${CLOUD_SQL_MSSQL_DATABASE}" + CLOUD_SQL_MSSQL_USER: "${CLOUD_SQL_MSSQL_USER}" + CLOUD_SQL_MSSQL_PASSWORD: '${CLOUD_SQL_MSSQL_PASSWORD}' + CLOUD_SQL_MSSQL_IP_TYPE: "${CLOUD_SQL_MSSQL_IP_TYPE}" diff --git a/evals/run_config.yaml b/evals/run_config.yaml new file mode 100644 index 0000000..d732762 --- /dev/null +++ b/evals/run_config.yaml @@ -0,0 +1,41 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dataset_config: /workspace/evals/dataset.json
+dataset_format: gemini-cli-format
+
+orchestrator: geminicli
+model_config: /workspace/evals/model_config.yaml
+simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+
+scorers:
+  # Qualitative (Judge-based)
+  goal_completion:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  behavioral_metrics:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  skills_best_practices:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+    skills_dir: /workspace/cloud-sql-sqlserver/skills
+
+  # Performance
+  turn_count: {}
+  end_to_end_latency: {}
+  tool_call_latency: {}
+  token_consumption: {}
+  skills_trajectory: {}
+
+reporting:
+  bigquery:
+    gcp_project_id: "${EVAL_REPORTING_PROJECT}"
diff --git a/evals/substitute_env.py b/evals/substitute_env.py
new file mode 100644
index 0000000..8505fc9
--- /dev/null
+++ b/evals/substitute_env.py
@@ -0,0 +1,64 @@
+"""Expand ${VAR} placeholders in the evaluation config files, in place.
+
+Each ${NAME} whose NAME is set in the environment is replaced by its value.
+Placeholders for unset variables are left untouched and reported on stderr.
+"""
+
+import json
+import os
+import re
+import sys
+
+CONFIG_PATHS = (
+    '/workspace/evals/model_config.yaml',
+    '/workspace/evals/run_config.yaml',
+    '/workspace/evals/dataset.json',
+)
+PLACEHOLDER = re.compile(r'\$\{(\w+)\}')
+
+
+def _json_escape(value):
+    """Escape value so it can be embedded inside a JSON string literal."""
+    return json.dumps(value)[1:-1]
+
+
+def _substitute(content, env, escape):
+    """Return (expanded content, sorted names of placeholders left unresolved)."""
+    unresolved = set()
+
+    def replace(match):
+        name = match.group(1)
+        if name not in env:
+            unresolved.add(name)
+            return match.group(0)
+        return escape(env[name])
+
+    return PLACEHOLDER.sub(replace, content), sorted(unresolved)
+
+
+def main():
+    """Rewrite each config file in place; return 1 if any file is missing."""
+    missing = False
+    for path in CONFIG_PATHS:
+        # Values go into YAML verbatim; dataset.json keeps its placeholders
+        # inside string literals, so values are JSON-escaped there.
+        escape = _json_escape if path.endswith('.json') else str
+        try:
+            with open(path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        except FileNotFoundError:
+            print(f"File not found: {path}", file=sys.stderr)
+            missing = True
+            continue
+        content, unresolved = _substitute(content, os.environ, escape)
+        with open(path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        print(f"Successfully substituted environment variables in {path}")
+        if unresolved:
+            names = ', '.join(unresolved)
+            print(f"Warning: no value for {names} in {path}", file=sys.stderr)
+    return 1 if missing else 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
From 8bce4da67c38f8b5124f9fba3fe2c44d5b7b5dd3 Mon Sep 17
00:00:00 2001 From: Omkar Gaikwad Date: Mon, 4 May 2026 10:40:10 +0000 Subject: [PATCH 2/3] ci: add ci:run-evals label for evaluation CI pipeline triggers --- .github/labels.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/labels.yaml b/.github/labels.yaml index 5974f23..abf2d71 100644 --- a/.github/labels.yaml +++ b/.github/labels.yaml @@ -83,4 +83,8 @@ - name: 'release-please:force-run' color: bdca82 - description: Manually trigger the release please workflow on a PR. \ No newline at end of file + description: Manually trigger the release please workflow on a PR. + +- name: 'ci:run-evals' + color: 4285f4 + description: Manually trigger the evaluation CI pipeline on a PR. \ No newline at end of file From 2085dc40e28831598c0676d6525197d6b1493c1c Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Mon, 4 May 2026 10:50:22 +0000 Subject: [PATCH 3/3] feat: add GEMINI_CLI_TRUST_WORKSPACE configuration to model_config.yaml --- evals/model_config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 71ece22..b7adf85 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -18,6 +18,7 @@ env: GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" GOOGLE_CLOUD_LOCATION: "global" GOOGLE_GENAI_USE_VERTEXAI: "true" + GEMINI_CLI_TRUST_WORKSPACE: "true" setup: extensions: # Points to the symlink created in cloudbuild.yaml to match the extension ID