diff --git a/.github/labels.yaml b/.github/labels.yaml index 5974f23..abf2d71 100644 --- a/.github/labels.yaml +++ b/.github/labels.yaml @@ -83,4 +83,8 @@ - name: 'release-please:force-run' color: bdca82 - description: Manually trigger the release please workflow on a PR. \ No newline at end of file + description: Manually trigger the release please workflow on a PR. + +- name: 'ci:run-evals' + color: 4285f4 + description: Manually trigger the evaluation CI pipeline on a PR. \ No newline at end of file diff --git a/cloudbuild.yaml b/cloudbuild.yaml index aadff4e..2c8c985 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -27,12 +27,7 @@ steps: - | set -e - # Only run on release branches - if [[ "$_HEAD_BRANCH" != release-please-* ]]; then - echo "Not a release-please branch. Exiting." - exit 0 - fi - echo "Release branch detected. Fetching PR data from GitHub API..." + echo "Fetching PR data from GitHub API..." # Fetch PR data and status code HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \ @@ -46,19 +41,25 @@ steps: PR_DATA=$(cat pr_data.json) - # Extract labels and title from PR data (Use $$ to escape bash variables) - PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")') + # Extract title from PR data (Use $$ to escape bash variables) PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title') - # Determine Release Version (Use double quotes and $$ for bash variables) - if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then + # Check if execution labels are present using exact matching via jq + if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then + echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution." + exit 0 + fi + echo "Execution label detected. Processing release version context..." + + # Determine Release Version based on branch name + if [[ "$_HEAD_BRANCH" == release-please-* ]]; then if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then export RELEASE_VERSION="$${BASH_REMATCH[1]}" else - export RELEASE_VERSION="unknown" + export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown" fi else - export RELEASE_VERSION="unknown" + export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals" fi # Workaround for evalbench bug: settings are only applied if path basename matches extension ID @@ -68,6 +69,8 @@ steps: export EVAL_GCP_PROJECT_ID=$PROJECT_ID export GOOGLE_CLOUD_PROJECT=$PROJECT_ID export EVAL_GCP_PROJECT_REGION=$_EVAL_REGION + export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT + # Combine CI metadata with run config cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml diff --git a/evals/dataset.json b/evals/dataset.json index ea7327a..3fe0879 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -2,28 +2,28 @@ "scenarios": [ { "id": "bq-search-and-insight", - "starting_prompt": "Find tables related to sales in project ext-test-bigquery-analytics.", + "starting_prompt": "Find tables related to sales in project ${GOOGLE_CLOUD_PROJECT}.", "conversation_plan": "First, ask the agent to find tables related to sales. Once it lists the tables (which should include 'sales_data' in 'evalbench_ci'), ask it to identify the top product by sales in that table.", "expected_trajectory": [ "search_catalog", "ask_data_insights" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 4 }, { "id": "bq-insight-and-forecast", - "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project 'ext-test-bigquery-analytics'?", + "starting_prompt": "What are the top products by sales in the table 'sales_data' in dataset 'evalbench_ci' and project '${GOOGLE_CLOUD_PROJECT}'?", "conversation_plan": "First, ask the agent to find the top products by sales in the sales_data table. After it identifies the top products, ask it to forecast the sales for the top product for the next 5 steps.", "expected_trajectory": [ "ask_data_insights", "forecast" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-bigquery-analytics" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 4 diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 6f99dc1..29bf6b4 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -gemini_cli_version: "@google/gemini-cli@0.38.1" +gemini_cli_version: "@google/gemini-cli@latest" generator: gemini_cli env: GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" GOOGLE_CLOUD_LOCATION: "global" GOOGLE_GENAI_USE_VERTEXAI: "true" + GEMINI_CLI_TRUST_WORKSPACE: "true" setup: extensions: # Points to the symlink created in cloudbuild.yaml to match the extension ID diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 0f45e6e..f9fb34b 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -25,13 +25,18 @@ scorers: model_config: /workspace/evals/gemini_2.5_pro_model.yaml behavioral_metrics: model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_best_practices: + model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_dir: /workspace/bigquery-data-analytics/skills # Performance turn_count: {} end_to_end_latency: {} tool_call_latency: {} token_consumption: {} + skills_trajectory: {} reporting: bigquery: - gcp_project_id: cloud-db-nl2sql + gcp_project_id: "${EVAL_REPORTING_PROJECT}" + diff --git a/evals/substitute_env.py b/evals/substitute_env.py index ded7c37..4ec23f6 100644 --- a/evals/substitute_env.py +++ b/evals/substitute_env.py @@ -6,7 +6,8 @@ def main(): workspace = os.environ.get('EVAL_WORKSPACE', '/workspace') yaml_paths = [ os.path.join(workspace, 'evals/model_config.yaml'), - os.path.join(workspace, 'evals/run_config.yaml') + os.path.join(workspace, 'evals/run_config.yaml'), + os.path.join(workspace, 'evals/dataset.json') ] for yaml_path in yaml_paths: