ci: update evalbench pipeline trigger, sync scorers, and bump gemini-cli

omkargaikwad23 · omkargaikwad23 · commit dacee17291e7 · 2026-05-05T05:42:58.000Z
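- **Pipeline Trigger Alignment:** Updated `cloudbuild.yaml` so the evaluation step no longer exits early on non-release branches. It now runs for any PR labeled `autorelease: pending` or `ci:run-evals` (the manual trigger label) and skips all others. `RELEASE_VERSION` is set to the version parsed from the PR title on `release-please-*` branches (falling back to `pr-<PR number>-release-unknown`), and to `pr-<PR number>-ci-run-evals` on all other branches.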
- **Scorers Sync:** Added missing `skills_best_practices` and `skills_trajectory` evaluation scorers to `evals/run_config.yaml`.
- **Tool Updates:** Changed `gemini_cli_version` from the pinned `@google/gemini-cli@0.38.1` to the floating `@google/gemini-cli@latest` tag, and set the `GEMINI_CLI_TRUST_WORKSPACE: "true"` environment variable so that the CLI trusts the workspace when running in the automated sandbox environment.
- **Repository Labels:** Appended the `ci:run-evals` label definition to `.github/labels.yaml`.
diff --git a/.github/labels.yaml b/.github/labels.yaml
@@ -83,4 +83,8 @@
 
 - name: 'release-please:force-run'
   color: bdca82
-  description: Manually trigger the release please workflow on a PR.
+  description: Manually trigger the release please workflow on a PR.
+
+- name: 'ci:run-evals'
+  color: 4285f4
+  description: Manually trigger the evaluation CI pipeline on a PR.
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
@@ -27,12 +27,7 @@ steps:
       - |
         set -e
 
-        # Only run on release branches
-        if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
-          echo "Not a release-please branch. Exiting."
-          exit 0
-        fi
-        echo "Release branch detected. Fetching PR data from GitHub API..."
+        echo "Fetching PR data from GitHub API..."
 
         # Fetch PR data and status code
         HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
@@ -50,15 +45,22 @@ steps:
         PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
         PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')
 
-        # Determine Release Version (Use double quotes and $$ for bash variables)
-        if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then
+        # Gate on exact label names: PR_LABELS is comma-joined, so wrap both sides in commas to avoid substring hits
+        if [[ ",$$PR_LABELS," != *",autorelease: pending,"* && ",$$PR_LABELS," != *",ci:run-evals,"* ]]; then
+          echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
+          exit 0
+        fi
+        echo "Execution label detected. Processing release version context..."
+
+        # Determine Release Version based on branch name
+        if [[ "$_HEAD_BRANCH" == release-please-* ]]; then
           if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
             export RELEASE_VERSION="$${BASH_REMATCH[1]}"
           else
-            export RELEASE_VERSION="unknown"
+            export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown"
           fi
         else
-          export RELEASE_VERSION="unknown"
+          export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals"
         fi
 
         # Workaround for evalbench bug: settings are only applied if path basename matches extension ID
diff --git a/evals/model_config.yaml b/evals/model_config.yaml
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-gemini_cli_version: "@google/gemini-cli@0.38.1"
+gemini_cli_version: "@google/gemini-cli@latest"
 generator: gemini_cli
 env:
   GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
   GOOGLE_CLOUD_LOCATION: "global"
   GOOGLE_GENAI_USE_VERTEXAI: "true"
+  GEMINI_CLI_TRUST_WORKSPACE: "true"
 setup:
   extensions:
     # Points to the symlink created in cloudbuild.yaml to match the extension ID
diff --git a/evals/run_config.yaml b/evals/run_config.yaml
@@ -25,12 +25,16 @@ scorers:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
   behavioral_metrics:
     model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  skills_best_practices:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+    skills_dir: /workspace/bigquery-data-analytics/skills
 
   # Performance
   turn_count: {}
   end_to_end_latency: {}
   tool_call_latency: {}
   token_consumption: {}
+  skills_trajectory: {}
 
 reporting:
   bigquery: