From dacee17291e7be35c35fa1b6d3616ec4eeeed83a Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 5 May 2026 05:42:58 +0000
Subject: [PATCH 1/5] ci: update evalbench pipeline trigger, sync scorers, and
 bump gemini-cli

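- **Pipeline Trigger Alignment:** Updated `cloudbuild.yaml` to gate evaluation runs on PR labels instead of the branch name: the pipeline now runs when the PR carries either `autorelease: pending` or the manual trigger label `ci:run-evals`, so non-release branches are supported. `RELEASE_VERSION` is parsed from the PR title on `release-please-*` branches (falling back to `pr-<PR number>-release-unknown`) and set to `pr-<PR number>-ci-run-evals` on all other branches.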
- **Scorers Sync:** Added missing `skills_best_practices` and `skills_trajectory` evaluation scorers to `evals/run_config.yaml`.
- **Tool Updates:** Changed `gemini_cli_version` from the pinned `@google/gemini-cli@0.38.1` to the floating `@google/gemini-cli@latest` tag, and set the `GEMINI_CLI_TRUST_WORKSPACE: "true"` environment variable so that the workspace is treated as trusted when the CLI runs in the automated sandbox environment.
- **Repository Labels:** Appended the `ci:run-evals` label definition to `.github/labels.yaml`.
---
 .github/labels.yaml     |  6 +++++-
 cloudbuild.yaml         | 22 ++++++++++++----------
 evals/model_config.yaml |  3 ++-
 evals/run_config.yaml   |  4 ++++
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/.github/labels.yaml b/.github/labels.yaml
index 5974f23..abf2d71 100644
--- a/.github/labels.yaml
+++ b/.github/labels.yaml
@@ -83,4 +83,8 @@
 
 - name: 'release-please:force-run'
   color: bdca82
-  description: Manually trigger the release please workflow on a PR.
\ No newline at end of file
+  description: Manually trigger the release please workflow on a PR.
+
+- name: 'ci:run-evals'
+  color: 4285f4
+  description: Manually trigger the evaluation CI pipeline on a PR.
\ No newline at end of file
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index aadff4e..c750b2c 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -27,12 +27,7 @@ steps:
       - |
         set -e
 
-        # Only run on release branches
-        if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
-          echo "Not a release-please branch. Exiting."
-          exit 0
-        fi
-        echo "Release branch detected. Fetching PR data from GitHub API..."
+        echo "Fetching PR data from GitHub API..."
 
         # Fetch PR data and status code
         HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
@@ -50,15 +45,22 @@ steps:
         PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
         PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
 
-        # Determine Release Version (Use double quotes and $$ for bash variables)
-        if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then
+        # Check if execution labels are present
+        if [[ "$$PR_LABELS" != *"autorelease: pending"* && "$$PR_LABELS" != *"ci:run-evals"* ]]; then
+          echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
+          exit 0
+        fi
+        echo "Execution label detected. Processing release version context..."
+
+        # Determine Release Version based on branch name
+        if [[ "$_HEAD_BRANCH" == release-please-* ]]; then
           if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
             export RELEASE_VERSION="$${BASH_REMATCH[1]}"
           else
-            export RELEASE_VERSION="unknown"
+            export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown"
           fi
         else
-          export RELEASE_VERSION="unknown"
+          export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals"
         fi
 
         # Workaround for evalbench bug: settings are only applied if path basename matches extension ID
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
index 6f99dc1..29bf6b4 100644
--- a/evals/model_config.yaml
+++ b/evals/model_config.yaml
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-gemini_cli_version: "@google/gemini-cli@0.38.1"
+gemini_cli_version: "@google/gemini-cli@latest"
 generator: gemini_cli
 env:
   GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
   GOOGLE_CLOUD_LOCATION: "global"
   GOOGLE_GENAI_USE_VERTEXAI: "true"
+  GEMINI_CLI_TRUST_WORKSPACE: "true"
 setup:
   extensions:
     # Points to the symlink created in cloudbuild.yaml to match the extension ID
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
index 0f45e6e..47d50b9 100644
--- a/evals/run_config.yaml
+++ b/evals/run_config.yaml
@@ -25,12 +25,16 @@ scorers:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
   behavioral_metrics:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  skills_best_practices:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+    skills_dir: /workspace/bigquery-data-analytics/skills
 
   # Performance
   turn_count: {}
   end_to_end_latency: {}
   tool_call_latency: {}
   token_consumption: {}
+  skills_trajectory: {}
 
 reporting:
   bigquery:

From 2cf40be95cdf3e0b7b5cb389c0cbc78f26f6a59f Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 5 May 2026 05:54:30 +0000
Subject: [PATCH 2/5] ci: replace string-based label checking with robust jq
 boolean evaluation in Cloud Build pipeline

---
 cloudbuild.yaml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index c750b2c..3b5542d 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -41,12 +41,11 @@ steps:
 
         PR_DATA=$(cat pr_data.json)
 
-        # Extract labels and title from PR data (Use $$ to escape bash variables)
-        PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
+        # Extract title from PR data (Use $$ to escape bash variables)
         PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
 
-        # Check if execution labels are present
-        if [[ "$$PR_LABELS" != *"autorelease: pending"* && "$$PR_LABELS" != *"ci:run-evals"* ]]; then
+        # Check if execution labels are present using exact matching via jq
+        if ! echo "$$PR_DATA" | jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' > /dev/null; then
           echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
           exit 0
         fi

From d8dc10ee58492918d960c7d9fb1fce3ca218643b Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 5 May 2026 07:00:35 +0000
Subject: [PATCH 3/5] refactor: update label check in cloudbuild to read from
 pr_data.json instead of piped input

---
 cloudbuild.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 3b5542d..433a9c4 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -45,7 +45,7 @@ steps:
         PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
 
         # Check if execution labels are present using exact matching via jq
-        if ! echo "$$PR_DATA" | jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' > /dev/null; then
+        if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then
           echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
           exit 0
         fi

From 8810634e58303f1537e4f6a355cb8cc303aec55f Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 5 May 2026 07:11:09 +0000
Subject: [PATCH 4/5] feat: parameterize evaluation reporting project ID using
 environment variable

---
 cloudbuild.yaml       | 2 ++
 evals/run_config.yaml | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 433a9c4..2c8c985 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -69,6 +69,8 @@ steps:
         export EVAL_GCP_PROJECT_ID=$PROJECT_ID
         export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
         export EVAL_GCP_PROJECT_REGION=$_EVAL_REGION
+        export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
+
 
         # Combine CI metadata with run config
         cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
index 47d50b9..f9fb34b 100644
--- a/evals/run_config.yaml
+++ b/evals/run_config.yaml
@@ -38,4 +38,5 @@ scorers:
 
 reporting:
   bigquery:
-    gcp_project_id: cloud-db-nl2sql
+    gcp_project_id: "${EVAL_REPORTING_PROJECT}"
+

From 0dcf726ac50a801800b780a35c0c64aa1f4e242a Mon Sep 17 00:00:00 2001
From: Omkar Gaikwad <omkargaikwad@google.com>
Date: Tue, 5 May 2026 07:25:53 +0000
Subject: [PATCH 5/5] chore: parameterize project ID in dataset and add JSON
 file to environment substitution script

---
 evals/dataset.json      | 8 ++++----
 evals/substitute_env.py | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/evals/dataset.json b/evals/dataset.json
index ea7327a..3fe0879 100644
--- a/evals/dataset.json
+++ b/evals/dataset.json
@@ -2,28 +2,28 @@
   "scenarios": [
     {
       "id": "bq-search-and-insight",
-      "starting_prompt": "Find tables related to sales in project ext-test-bigquery-analytics.",
+      "starting_prompt": "Find tables related to sales in project ${GOOGLE_CLOUD_PROJECT}.",
       "conversation_plan": "First, ask the agent to find tables related to sales. Once it lists the tables (which should include 'sales_data' in 'evalbench_ci'), ask it to identify the top product by sales in that table.",
       "expected_trajectory": [
         "search_catalog",
         "ask_data_insights"
       ],
       "env": {
-        "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
       "kind": "tools",
       "max_turns": 4
     },
     {
       "id": "bq-insight-and-forecast",
-      "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project 'ext-test-bigquery-analytics'?",
+      "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project '${GOOGLE_CLOUD_PROJECT}'?",
       "conversation_plan": "First, ask the agent to find the top products by sales in the sales_data table. After it identifies the top products, ask it to forecast the sales for the top product for the next 5 steps.",
       "expected_trajectory": [
         "ask_data_insights",
         "forecast"
       ],
       "env": {
-        "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics"
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
       "kind": "tools",
       "max_turns": 4
diff --git a/evals/substitute_env.py b/evals/substitute_env.py
index ded7c37..4ec23f6 100644
--- a/evals/substitute_env.py
+++ b/evals/substitute_env.py
@@ -6,7 +6,8 @@ def main():
     workspace = os.environ.get('EVAL_WORKSPACE', '/workspace')
     yaml_paths = [
         os.path.join(workspace, 'evals/model_config.yaml'),
-        os.path.join(workspace, 'evals/run_config.yaml')
+        os.path.join(workspace, 'evals/run_config.yaml'),
+        os.path.join(workspace, 'evals/dataset.json')
     ]
     
     for yaml_path in yaml_paths: