Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/labels.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,8 @@

- name: 'release-please:force-run'
color: bdca82
description: Manually trigger the release please workflow on a PR.
description: Manually trigger the release please workflow on a PR.

- name: 'ci:run-evals'
color: 4285f4
description: Manually trigger the evaluation CI pipeline on a PR.
30 changes: 18 additions & 12 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,7 @@ steps:
- |
set -e

# Only run on release branches
if [[ "$_HEAD_BRANCH" != release-please-* ]]; then
echo "Not a release-please branch. Exiting."
exit 0
fi
echo "Release branch detected. Fetching PR data from GitHub API..."
echo "Fetching PR data from GitHub API..."

# Fetch PR data and status code
HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
Expand All @@ -46,28 +41,39 @@ steps:

PR_DATA=$(cat pr_data.json)

# Extract labels and title from PR data (Use $$ to escape bash variables)
PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")')
# Extract title from PR data (Use $$ to escape bash variables)
PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title')

# Determine Release Version (Use double quotes and $$ for bash variables)
if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then
# Check if execution labels are present using exact matching via jq
if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then
echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
exit 0
fi
echo "Execution label detected. Processing release version context..."

# Determine Release Version based on branch name
if [[ "$_HEAD_BRANCH" == release-please-* ]]; then
if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
export RELEASE_VERSION="$${BASH_REMATCH[1]}"
else
export RELEASE_VERSION="unknown"
export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown"
fi
else
export RELEASE_VERSION="unknown"
export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals"
fi

# Workaround for evalbench bug: settings are only applied if path basename matches extension ID
ln -s /workspace /workspace/cloud-sql-postgresql
cd /evalbench

# Set environment variables for evalbench
export EVAL_GCP_PROJECT_ID=$PROJECT_ID
export EVAL_GCP_PROJECT_REGION=$_CLOUD_SQL_REGION
export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT


# Set environment variables for extension
export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
export CLOUD_SQL_POSTGRES_REGION=$_CLOUD_SQL_REGION
Expand Down
16 changes: 8 additions & 8 deletions evals/dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
"scenarios": [
{
"id": "cloud-sql-debug-instance",
"starting_prompt": "Check on my databases in project ext-test-cloud-sql-postgres.",
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if 'daily-ci-evals-db' exists, get its details and validate it is RUNNABLE.",
"starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.",
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.",
"expected_trajectory": [
"list_instances",
"get_instance"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
Expand All @@ -23,7 +23,7 @@
"list_tables"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
Expand All @@ -37,21 +37,21 @@
"list_locks"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
},
{
"id": "cloud-sql-metrics-cpu-investigation",
"starting_prompt": "I'm worried about the database load for daily-ci-evals-db.",
"conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
"starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.",
"conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
"expected_trajectory": [
"get_system_metrics",
"list_database_stats"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
Expand Down
3 changes: 2 additions & 1 deletion evals/model_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

gemini_cli_version: "@google/gemini-cli@0.38.1"
gemini_cli_version: "@google/gemini-cli@latest"
generator: gemini_cli
env:
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
GOOGLE_CLOUD_LOCATION: "global"
GOOGLE_GENAI_USE_VERTEXAI: "true"
GEMINI_CLI_TRUST_WORKSPACE: "true"
setup:
extensions:
# Points to the symlink created in cloudbuild.yaml to match the extension ID
Expand Down
7 changes: 6 additions & 1 deletion evals/run_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,18 @@ scorers:
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
behavioral_metrics:
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
skills_best_practices:
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
skills_dir: /workspace/cloud-sql-postgresql/skills

# Performance
turn_count: {}
end_to_end_latency: {}
tool_call_latency: {}
token_consumption: {}
skills_trajectory: {}

reporting:
bigquery:
gcp_project_id: cloud-db-nl2sql
gcp_project_id: "${EVAL_REPORTING_PROJECT}"

2 changes: 1 addition & 1 deletion evals/substitute_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re

def main():
yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml']
yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json']
for yaml_path in yaml_paths:
if os.path.exists(yaml_path):
with open(yaml_path, 'r') as f:
Expand Down
Loading