ci: add ci:run-evals label support and missing scorers, and parameterize the evaluation reporting project for flexible CI execution (#168)

omkargaikwad23 · web-flow · commit 69c0c820513d · 2026-05-05T14:59:08.000+05:30
diff --git a/.github/labels.yaml b/.github/labels.yaml
@@ -83,4 +83,8 @@
 
 - name: 'release-please:force-run'
   color: bdca82
-  description: Manually trigger the release please workflow on a PR.
+  description: Manually trigger the release please workflow on a PR.
+
+- name: 'ci:run-evals'
+  color: 4285f4
+  description: Manually trigger the evaluation CI pipeline on a PR.
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
@@ -27,12 +27,7 @@ steps:
       - |
         set -e
 
-        # Only run on release branches
-        if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
-          echo "Not a release-please branch. Exiting."
-          exit 0
-        fi
-        echo "Release branch detected. Fetching PR data from GitHub API..."
+        echo "Fetching PR data from GitHub API..."
 
         # Fetch PR data and status code
         HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
@@ -46,28 +41,39 @@ steps:
 
         PR_DATA=$(cat pr_data.json)
 
-        # Extract labels and title from PR data (Use $$ to escape bash variables)
-        PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
+        # Extract the PR title ($$ is the Cloud Build escape that passes a literal $ through to bash)
         PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
 
-        # Determine Release Version (Use double quotes and $$ for bash variables)
-        if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then
+        # Check if execution labels are present using exact matching via jq
+        if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then
+          echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
+          exit 0
+        fi
+        echo "Execution label detected. Processing release version context..."
+
+        # Determine RELEASE_VERSION: parsed from the PR title on release-please branches, otherwise derived from the PR number
+        if [[ "$_HEAD_BRANCH" == release-please-* ]]; then
           if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
             export RELEASE_VERSION="$${BASH_REMATCH[1]}"
           else
-            export RELEASE_VERSION="unknown"
+            export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown"
           fi
         else
-          export RELEASE_VERSION="unknown"
+          export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals"
         fi
 
         # Workaround for evalbench bug: settings are only applied if path basename matches extension ID
         ln -s /workspace /workspace/cloud-sql-postgresql
         cd /evalbench
 
+        # Set environment variables for evalbench
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID
         export EVAL_GCP_PROJECT_REGION=$_CLOUD_SQL_REGION
         export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
+        export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
+
+
+        # Set environment variables for extension
         export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
         export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
         export CLOUD_SQL_POSTGRES_REGION=$_CLOUD_SQL_REGION
diff --git a/evals/dataset.json b/evals/dataset.json
@@ -2,14 +2,14 @@
   "scenarios": [
     {
       "id": "cloud-sql-debug-instance",
-      "starting_prompt": "Check on my databases in project ext-test-cloud-sql-postgres.",
-      "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if 'daily-ci-evals-db' exists, get its details and validate it is RUNNABLE.",
+      "starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.",
+      "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.",
       "expected_trajectory": [
         "list_instances",
         "get_instance"
       ],
       "env": {
-        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
       "kind": "tools",
       "max_turns": 3
@@ -23,7 +23,7 @@
         "list_tables"
       ],
       "env": {
-        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
       "kind": "tools",
       "max_turns": 3
@@ -37,21 +37,21 @@
         "list_locks"
       ],
       "env": {
-        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
       "kind": "tools",
       "max_turns": 3
     },
     {
       "id": "cloud-sql-metrics-cpu-investigation",
-      "starting_prompt": "I'm worried about the database load for daily-ci-evals-db.",
-      "conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
+      "starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.",
+      "conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
       "expected_trajectory": [
         "get_system_metrics",
         "list_database_stats"
       ],
       "env": {
-        "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
       "kind": "tools",
       "max_turns": 3
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-gemini_cli_version: "@google/gemini-cli@0.38.1"
+gemini_cli_version: "@google/gemini-cli@latest"
 generator: gemini_cli
 env:
   GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
   GOOGLE_CLOUD_LOCATION: "global"
   GOOGLE_GENAI_USE_VERTEXAI: "true"
+  GEMINI_CLI_TRUST_WORKSPACE: "true"
 setup:
   extensions:
     # Points to the symlink created in cloudbuild.yaml to match the extension ID
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
@@ -25,13 +25,18 @@ scorers:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
   behavioral_metrics:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  skills_best_practices:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+    skills_dir: /workspace/cloud-sql-postgresql/skills
 
   # Performance
   turn_count: {}
   end_to_end_latency: {}
   tool_call_latency: {}
   token_consumption: {}
+  skills_trajectory: {}
 
 reporting:
   bigquery:
-    gcp_project_id: cloud-db-nl2sql
+    gcp_project_id: "${EVAL_REPORTING_PROJECT}"
+
diff --git a/evals/substitute_env.py b/evals/substitute_env.py
@@ -2,7 +2,7 @@
 import re
 
 def main():
-    yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml']
+    yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json']
     for yaml_path in yaml_paths:
         if os.path.exists(yaml_path):
             with open(yaml_path, 'r') as f: