From dacee17291e7be35c35fa1b6d3616ec4eeeed83a Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 5 May 2026 05:42:58 +0000 Subject: [PATCH 1/5] ci: update evalbench pipeline trigger, sync scorers, and bump gemini-cli - **Pipeline Trigger Alignment:** Updated `cloudbuild.yaml` to support the manual evaluation trigger label (`ci:run-evals`) for non-release branches and set the correct `RELEASE_VERSION` context. - **Scorers Sync:** Added missing `skills_best_practices` and `skills_trajectory` evaluation scorers to `evals/run_config.yaml`. - **Tool Updates:** Bumped `gemini_cli_version` to `@google/gemini-cli@latest` and enabled `GEMINI_CLI_TRUST_WORKSPACE: "true"` environment variable to ensure secure execution workspace trust in the automated sandbox environment. - **Repository Labels:** Appended the `ci:run-evals` label definition to `.github/labels.yaml`. --- .github/labels.yaml | 6 +++++- cloudbuild.yaml | 22 ++++++++++++---------- evals/model_config.yaml | 3 ++- evals/run_config.yaml | 4 ++++ 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/.github/labels.yaml b/.github/labels.yaml index 5974f23..abf2d71 100644 --- a/.github/labels.yaml +++ b/.github/labels.yaml @@ -83,4 +83,8 @@ - name: 'release-please:force-run' color: bdca82 - description: Manually trigger the release please workflow on a PR. \ No newline at end of file + description: Manually trigger the release please workflow on a PR. + +- name: 'ci:run-evals' + color: 4285f4 + description: Manually trigger the evaluation CI pipeline on a PR. \ No newline at end of file diff --git a/cloudbuild.yaml b/cloudbuild.yaml index aadff4e..c750b2c 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -27,12 +27,7 @@ steps: - | set -e - # Only run on release branches - if [[ "$_HEAD_BRANCH" != release-please-* ]]; then - echo "Not a release-please branch. Exiting." - exit 0 - fi - echo "Release branch detected. Fetching PR data from GitHub API..." + echo "Fetching PR data from GitHub API..." # Fetch PR data and status code HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \ @@ -50,15 +45,22 @@ steps: PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")') PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title') - # Determine Release Version (Use double quotes and $$ for bash variables) - if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then + # Check if execution labels are present + if [[ "$$PR_LABELS" != *"autorelease: pending"* && "$$PR_LABELS" != *"ci:run-evals"* ]]; then + echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution." + exit 0 + fi + echo "Execution label detected. Processing release version context..." + + # Determine Release Version based on branch name + if [[ "$_HEAD_BRANCH" == release-please-* ]]; then if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then export RELEASE_VERSION="$${BASH_REMATCH[1]}" else - export RELEASE_VERSION="unknown" + export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown" fi else - export RELEASE_VERSION="unknown" + export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals" fi # Workaround for evalbench bug: settings are only applied if path basename matches extension ID diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 6f99dc1..29bf6b4 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -gemini_cli_version: "@google/gemini-cli@0.38.1" +gemini_cli_version: "@google/gemini-cli@latest" generator: gemini_cli env: GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" GOOGLE_CLOUD_LOCATION: "global" GOOGLE_GENAI_USE_VERTEXAI: "true" + GEMINI_CLI_TRUST_WORKSPACE: "true" setup: extensions: # Points to the symlink created in cloudbuild.yaml to match the extension ID diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 0f45e6e..47d50b9 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -25,12 +25,16 @@ scorers: model_config: /workspace/evals/gemini_2.5_pro_model.yaml behavioral_metrics: model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_best_practices: + model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_dir: /workspace/bigquery-data-analytics/skills # Performance turn_count: {} end_to_end_latency: {} tool_call_latency: {} token_consumption: {} + skills_trajectory: {} reporting: bigquery: From 2cf40be95cdf3e0b7b5cb389c0cbc78f26f6a59f Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 5 May 2026 05:54:30 +0000 Subject: [PATCH 2/5] ci: replace string-based label checking with robust jq boolean evaluation in Cloud Build pipeline --- cloudbuild.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index c750b2c..3b5542d 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -41,12 +41,11 @@ steps: PR_DATA=$(cat pr_data.json) - # Extract labels and title from PR data (Use $$ to escape bash variables) - PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")') + # Extract title from PR data (Use $$ to escape bash variables) PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title') - # Check if execution labels are present - if [[ "$$PR_LABELS" != *"autorelease: pending"* && "$$PR_LABELS" != *"ci:run-evals"* ]]; then + # Check if execution labels are present using exact matching via jq + if ! echo "$$PR_DATA" | jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' > /dev/null; then echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution." exit 0 fi From d8dc10ee58492918d960c7d9fb1fce3ca218643b Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 5 May 2026 07:00:35 +0000 Subject: [PATCH 3/5] refactor: update label check in cloudbuild to read from pr_data.json instead of piped input --- cloudbuild.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 3b5542d..433a9c4 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -45,7 +45,7 @@ steps: PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title') # Check if execution labels are present using exact matching via jq - if ! echo "$$PR_DATA" | jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' > /dev/null; then + if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution." exit 0 fi From 8810634e58303f1537e4f6a355cb8cc303aec55f Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 5 May 2026 07:11:09 +0000 Subject: [PATCH 4/5] feat: parameterize evaluation reporting project ID using environment variable --- cloudbuild.yaml | 2 ++ evals/run_config.yaml | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 433a9c4..2c8c985 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -69,6 +69,8 @@ steps: export EVAL_GCP_PROJECT_ID=$PROJECT_ID export GOOGLE_CLOUD_PROJECT=$PROJECT_ID export EVAL_GCP_PROJECT_REGION=$_EVAL_REGION + export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT + # Combine CI metadata with run config cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 47d50b9..f9fb34b 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -38,4 +38,5 @@ scorers: reporting: bigquery: - gcp_project_id: cloud-db-nl2sql + gcp_project_id: "${EVAL_REPORTING_PROJECT}" + From 0dcf726ac50a801800b780a35c0c64aa1f4e242a Mon Sep 17 00:00:00 2001 From: Omkar Gaikwad Date: Tue, 5 May 2026 07:25:53 +0000 Subject: [PATCH 5/5] chore: parameterize project ID in dataset and add JSON file to environment substitution script --- evals/dataset.json | 8 ++++---- evals/substitute_env.py | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/evals/dataset.json b/evals/dataset.json index ea7327a..3fe0879 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -2,28 +2,28 @@ "scenarios": [ { "id": "bq-search-and-insight", - "starting_prompt": "Find tables related to sales in project ext-test-bigquery-analytics.", + "starting_prompt": "Find tables related to sales in project ${GOOGLE_CLOUD_PROJECT}.", "conversation_plan": "First, ask the agent to find tables related to sales. Once it lists the tables (which should include 'sales_data' in 'evalbench_ci'), ask it to identify the top product by sales in that table.", "expected_trajectory": [ "search_catalog", "ask_data_insights" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 4 }, { "id": "bq-insight-and-forecast", - "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project 'ext-test-bigquery-analytics'?", + "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project '${GOOGLE_CLOUD_PROJECT}'?", "conversation_plan": "First, ask the agent to find the top products by sales in the sales_data table. After it identifies the top products, ask it to forecast the sales for the top product for the next 5 steps.", "expected_trajectory": [ "ask_data_insights", "forecast" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 4 diff --git a/evals/substitute_env.py b/evals/substitute_env.py index ded7c37..4ec23f6 100644 --- a/evals/substitute_env.py +++ b/evals/substitute_env.py @@ -6,7 +6,8 @@ def main(): workspace = os.environ.get('EVAL_WORKSPACE', '/workspace') yaml_paths = [ os.path.join(workspace, 'evals/model_config.yaml'), - os.path.join(workspace, 'evals/run_config.yaml') + os.path.join(workspace, 'evals/run_config.yaml'), + os.path.join(workspace, 'evals/dataset.json') ] for yaml_path in yaml_paths: