diff --git a/bigquery-setup/README.md b/bigquery-setup/README.md new file mode 100644 index 000000000..c792dffbe --- /dev/null +++ b/bigquery-setup/README.md @@ -0,0 +1,369 @@ +# BigQuery Local Setup + +Local BigQuery emulator and validation instructions for the Wayang BigQuery +platform. + +The current validation has three parts: + +1. Build the Wayang BigQuery platform and run the shared JDBC SQL-generation tests. +2. Run BigQuery-compatible SQL tests against the local emulator. +3. Run the Wayang BigQuery operator tests through JDBC against real BigQuery. + +Run the commands below from the repository root. Java 17 and Docker with Docker +Compose are required for the emulator tests. A GCP project and service-account +key, plus the `gcloud` SDK, are required only for the real BigQuery operator +tests. Maven is provided by the repository wrapper. + +```bash +git checkout wayang-bigquery +``` + +## Command Conventions + +Use the `bash` blocks on macOS/Linux terminals. Use the `powershell` blocks on +Windows PowerShell from the repository root. Docker Compose commands are the +same on both platforms. The `gcloud` commands also work on Windows; either run +each command on one line or replace Bash line-continuation backslashes with +PowerShell backticks. + +## Stack + +| Component | Image | Port | Role | +|-----------|-------|------|------| +| **BigQuery Emulator** | `ghcr.io/goccy/bigquery-emulator:0.6.6` | 9050 (HTTP) / 9060 (gRPC) | BigQuery-compatible SQL engine | + +Single container. Data is seeded from `data.yaml` on startup and lives in memory. + +## Directory Layout + +``` +bigquery-setup/ +|-- docker-compose.yml # Emulator container +|-- data.yaml # Seed data (test-project.sales.orders) +|-- pom.xml # Standalone Maven project +`-- src/test/java/.../ + `-- BigQueryEmulatorIT.java # JUnit 5 integration tests + +wayang-platforms/wayang-bigquery/src/test/java/.../ +`-- BigQueryOperatorsIT.java # Wayang operator tests against real BigQuery +``` + +## 1. 
Test the Wayang BigQuery Platform + +Build the BigQuery platform and its required modules: + +```bash +./mvnw -Pskip-prerequisite-check -pl wayang-platforms/wayang-bigquery -am -DskipTests -Drat.skip=true test +``` + +On PowerShell: + +```powershell +.\mvnw.cmd --% -Pskip-prerequisite-check -pl wayang-platforms/wayang-bigquery -am -DskipTests -Drat.skip=true test +``` + +Then run the shared JDBC SQL-generation tests: + +```bash +./mvnw -Pskip-prerequisite-check -pl wayang-platforms/wayang-jdbc-template -am -Dtest=JdbcExecutorTest -Dsurefire.failIfNoSpecifiedTests=false -DfailIfNoTests=false -Drat.skip=true test +``` + +On PowerShell: + +```powershell +.\mvnw.cmd --% -Pskip-prerequisite-check -pl wayang-platforms/wayang-jdbc-template -am -Dtest=JdbcExecutorTest -Dsurefire.failIfNoSpecifiedTests=false -DfailIfNoTests=false -Drat.skip=true test +``` + +Expected result: + +```text +Wayang Platform BigQuery ... SUCCESS +Tests run: 4, Failures: 0, Errors: 0, Skipped: 0 +``` + +## 2. Test the Local BigQuery Emulator + +### 1. Start the emulator + +```bash +docker compose -f bigquery-setup/docker-compose.yml up -d +``` + +The emulator starts in ~2 seconds. Data from `data.yaml` is loaded automatically. + +### 2. Run integration tests + +```bash +./mvnw -f bigquery-setup/pom.xml -Dtest=BigQueryEmulatorIT test +``` + +On PowerShell: + +```powershell +.\mvnw.cmd --% -f bigquery-setup/pom.xml -Dtest=BigQueryEmulatorIT test +``` + +The successful result must show that no tests were skipped: + +```text +Tests run: 7, Failures: 0, Errors: 0, Skipped: 0 +``` + +If the emulator is unavailable, Maven can still print `BUILD SUCCESS` while +showing `Skipped: 7`. That does not count as a successful emulator test. + +### 3. 
Manual exploration + +Query via curl: + +```bash +curl -s -X POST \ + "http://localhost:9050/bigquery/v2/projects/test-project/queries" \ + -H "Content-Type: application/json" \ + -d '{"query": "SELECT * FROM sales.orders LIMIT 5", "useLegacySql": false}' \ + | python3 -m json.tool +``` + +### 4. Tear down + +```bash +docker compose -f bigquery-setup/docker-compose.yml down +``` + +## 3. Test the Wayang Operators Against Real BigQuery + +`BigQueryOperatorsIT` uses the BigQuery JDBC driver and cannot run against the +local emulator. It requires a real GCP project, a service-account JSON key, and +a reference table containing the same 10 rows as `bigquery-setup/data.yaml`. + +The tests issue `SELECT`, `CREATE TABLE AS`, and `DROP` statements. The +`TableSink` test creates and then drops `sales.wayang_emea_orders`; the +reference `sales.orders` table remains in place. + +### 1. Enable BigQuery and create a service account + +Replace `YOUR_PROJECT_ID` in the following commands: + +```bash +gcloud auth login +gcloud config set project YOUR_PROJECT_ID +gcloud services enable bigquery.googleapis.com + +gcloud iam service-accounts create wayang-bq \ + --display-name="Wayang BigQuery IT" + +gcloud projects add-iam-policy-binding YOUR_PROJECT_ID \ + --member="serviceAccount:wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com" \ + --role="roles/bigquery.jobUser" + +gcloud projects add-iam-policy-binding YOUR_PROJECT_ID \ + --member="serviceAccount:wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com" \ + --role="roles/bigquery.dataEditor" + +gcloud iam service-accounts keys create "$HOME/wayang-bq-key.json" \ + --iam-account="wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com" +``` + +On Windows PowerShell, the same setup can be run as: + +```powershell +gcloud auth login +gcloud config set project YOUR_PROJECT_ID +gcloud services enable bigquery.googleapis.com +gcloud iam service-accounts create wayang-bq --display-name="Wayang BigQuery IT" +gcloud projects add-iam-policy-binding 
YOUR_PROJECT_ID --member="serviceAccount:wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com" --role="roles/bigquery.jobUser" +gcloud projects add-iam-policy-binding YOUR_PROJECT_ID --member="serviceAccount:wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com" --role="roles/bigquery.dataEditor" +gcloud iam service-accounts keys create "$HOME\wayang-bq-key.json" --iam-account="wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com" +``` + +The service account needs `jobUser` to run queries and `dataEditor` to read the +reference table and create/drop the sink table. + +### 2. Load the reference table + +Create a US dataset, then load the exact rows from `data.yaml` with a load job: + +```bash +bq --location=US mk --dataset YOUR_PROJECT_ID:sales + +cat > /tmp/orders.csv <<'CSV' +1,APAC,Widget A,1500.0 +2,EMEA,Widget B,800.5 +3,AMER,Widget A,2200.0 +4,APAC,Widget C,350.75 +5,EMEA,Widget A,1100.0 +6,AMER,Widget B,950.25 +7,APAC,Widget B,1750.0 +8,EMEA,Widget C,420.0 +9,AMER,Widget C,680.5 +10,APAC,Widget A,3000.0 +CSV + +bq --project_id=YOUR_PROJECT_ID --location=US load --replace \ + --source_format=CSV sales.orders /tmp/orders.csv \ + order_id:INTEGER,region:STRING,product:STRING,amount:FLOAT +``` + +Confirm that the table matches the assertions: + +```bash +bq --project_id=YOUR_PROJECT_ID --location=US query --use_legacy_sql=false \ + 'SELECT count(*) n, round(sum(amount), 2) total FROM `YOUR_PROJECT_ID.sales.orders`' +``` + +Expected values are `n = 10` and `total = 12752.0`. + +### 3. 
Run the operator tests + +```bash +./mvnw -Pskip-prerequisite-check -pl wayang-platforms/wayang-bigquery -am \ + -Dtest=BigQueryOperatorsIT -Dsurefire.failIfNoSpecifiedTests=false \ + -DfailIfNoTests=false \ + -Dbigquery.project=YOUR_PROJECT_ID \ + -Dbigquery.saEmail=wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com \ + -Dbigquery.keyPath="$HOME/wayang-bq-key.json" \ + -Dbigquery.location=US \ + -Drat.skip=true -Dlicense.skip=true test +``` + +On PowerShell: + +```powershell +.\mvnw.cmd --% -Pskip-prerequisite-check -pl wayang-platforms/wayang-bigquery -am -Dtest=BigQueryOperatorsIT -Dsurefire.failIfNoSpecifiedTests=false -DfailIfNoTests=false -Dbigquery.project=YOUR_PROJECT_ID -Dbigquery.saEmail=wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com -Dbigquery.keyPath=C:\path\to\wayang-bq-key.json -Dbigquery.location=US -Drat.skip=true -Dlicense.skip=true test +``` + +System properties take precedence over the equivalent environment variables: + +| System property | Environment variable | Default | +|-----------------|----------------------|---------| +| `bigquery.project` | `BIGQUERY_PROJECT` | `your-project` | +| `bigquery.saEmail` | `BIGQUERY_SA_EMAIL` | `wayang-bq@<project>.iam.gserviceaccount.com` | +| `bigquery.keyPath` | `BIGQUERY_KEY_PATH` | `$HOME/wayang-bq-key.json` | +| `bigquery.table` | `BIGQUERY_TABLE` | `` `<project>.sales.orders` `` | +| `bigquery.location` | `BIGQUERY_LOCATION` | `US` | + +Successful real-BigQuery validation must show: + +```text +Tests run: 18, Failures: 0, Errors: 0, Skipped: 0 +``` + +### Previously verified result + +On June 11, 2026, the original 12-test real-BigQuery suite was run successfully +against a non-billing GCP project using the service-account flow documented +above: + +```text +[SETUP] Connected to BigQuery project +[PASS] TableScan: 10 rows +[PASS] Filter(region='APAC'): 4 rows +[PASS] GlobalReduce SUM(amount) = 12752.0 +[PASS] TableSink wrote 3 EMEA rows +Tests run: 12, Failures: 0, Errors: 0, Skipped: 0 +BUILD SUCCESS +``` + +This 
verified the complete `Wayang -> BigQuery JDBC -> service-account OAuth -> +real BigQuery` path, including reads, SQL pushdown, aggregation, sorting, and +`CREATE TABLE AS SELECT`. The sink table was removed automatically after the +test, while the reference `sales.orders` table was retained for reruns. No +service-account key or credential file is stored in this repository. + +On June 18, 2026, the expanded 18-test suite was also verified successfully +against real BigQuery, using `Location=US` and the local proxy settings when +needed: + +```text +Tests run: 18, Failures: 0, Errors: 0, Skipped: 0 +BUILD SUCCESS +``` + +This includes the full Wayang join plan with join-result normalization and all +five `JavaPlanBuilder` combination tests. On the same date, the local BigQuery +emulator suite was re-run with Docker and passed 7/7 with zero skipped tests. + +If the browser uses a local proxy, pass the same proxy to both CLI tools and +the Maven test JVM. For example, with a proxy at `127.0.0.1:7890`, set +`HTTP_PROXY`/`HTTPS_PROXY` and use `JAVA_TOOL_OPTIONS` with +`-Dhttp.proxyHost`, `-Dhttp.proxyPort`, `-Dhttps.proxyHost`, and +`-Dhttps.proxyPort`. + +On PowerShell: + +```powershell +$env:HTTP_PROXY="http://127.0.0.1:7890" +$env:HTTPS_PROXY="http://127.0.0.1:7890" +$env:JAVA_TOOL_OPTIONS="-Dhttp.proxyHost=127.0.0.1 -Dhttp.proxyPort=7890 -Dhttps.proxyHost=127.0.0.1 -Dhttps.proxyPort=7890" +.\mvnw.cmd --% -Pskip-prerequisite-check -pl wayang-platforms/wayang-bigquery -am -Dtest=BigQueryOperatorsIT -Dsurefire.failIfNoSpecifiedTests=false -DfailIfNoTests=false -Dbigquery.project=YOUR_PROJECT_ID -Dbigquery.saEmail=wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com -Dbigquery.keyPath=C:\path\to\wayang-bq-key.json -Dbigquery.location=US -Drat.skip=true -Dlicense.skip=true test +Remove-Item Env:HTTP_PROXY, Env:HTTPS_PROXY, Env:JAVA_TOOL_OPTIONS +``` + +If credentials or the project configuration are missing, Maven can still print +`BUILD SUCCESS` with `Skipped: 17`. 
Only the platform-binding test ran in that +case, so the BigQuery operators were not validated. + +## Test Coverage + +### Local emulator tests + +| Test | What it checks | +|------|----------------| +| `testDatasetVisible` | `sales` dataset exists | +| `testFullScan` | Full table scan, 10 rows | +| `testFilterByRegion` | `WHERE region = 'APAC'` | +| `testFilterByAmount` | `WHERE amount > 1000` | +| `testAggregation` | `GROUP BY region` + `SUM(amount)` | +| `testProjection` | `SELECT region, product LIMIT 5` | +| `testCount` | `SELECT count(*)`, used by Wayang for cardinality estimation | + +### Real BigQuery operator tests + +| Test | What it checks | +|------|----------------| +| `testPlatformBinding` | `BigQueryTableSource` is bound to `BigQueryPlatform` | +| `testFailsWithoutJdbcConfig` | Execution fails clearly without the JDBC URL | +| `testTableScan` | Full table scan through Wayang | +| `testFilterString` | String filter pushdown | +| `testFilterNumeric` | Numeric filter pushdown | +| `testProjection` | Multi-column projection pushdown | +| `testFilterAndProjection` | Combined filter and projection pipeline | +| `testCardinalityMatches` | BigQuery `COUNT(*)` cardinality estimate | +| `testGlobalReduce` | Global `SUM(amount)` | +| `testReduceBy` | `SUM(amount) GROUP BY region` | +| `testSort` | BigQuery sort operator SQL-clause contract | +| `testTableSink` | `CREATE TABLE AS SELECT` and cleanup | +| `testJoin` | Full Wayang join plan with normalization before the collecting sink | +| `javaPlanBuilderReadTableFilterProjection` | `readTable -> filter -> projection -> collect` | +| `javaPlanBuilderReadTableFilterGlobalReduce` | `readTable -> filter -> globalReduce -> collect` | +| `javaPlanBuilderReadTableReduceBySort` | `readTable -> reduceByKey -> sort -> collect` | +| `javaPlanBuilderReadTableFilterProjectionTableSink` | `readTable -> filter -> projection -> writeTable` | +| `javaPlanBuilderReadTableJoin` | `readTable + readTable -> join -> collect` | + 
+The combination tests use `.withTargetPlatform(BigQuery.platform())` so the +small 10-row fixture still exercises BigQuery SQL pushdown. The join test creates +and cleans up a temporary distinct-region lookup table. + +## Emulator Environment Variable + +```bash +BIGQUERY_HOST=http://localhost:9050 ./mvnw -f bigquery-setup/pom.xml -Dtest=BigQueryEmulatorIT test +``` + +On PowerShell: + +```powershell +$env:BIGQUERY_HOST="http://localhost:9050" +.\mvnw.cmd --% -f bigquery-setup/pom.xml -Dtest=BigQueryEmulatorIT test +Remove-Item Env:BIGQUERY_HOST +``` + +## Notes + +- Tests use `google-cloud-bigquery` client library (REST-based, no JDBC). +- The client connects with `NoCredentials`; no GCP account is needed. +- The BigQuery JDBC driver (`google-cloud-bigquery-jdbc`) requires OAuth even + against the emulator, so `BigQueryOperatorsIT` runs only against real + BigQuery. +- Emulator tests validate SQL compatibility, but only `BigQueryOperatorsIT` + validates end-to-end Wayang-to-BigQuery JDBC execution. diff --git a/bigquery-setup/data.yaml b/bigquery-setup/data.yaml new file mode 100644 index 000000000..c1a283285 --- /dev/null +++ b/bigquery-setup/data.yaml @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +projects: +- id: test-project + datasets: + - id: sales + tables: + - id: orders + columns: + - name: order_id + type: INTEGER + - name: region + type: STRING + - name: product + type: STRING + - name: amount + type: FLOAT + data: + - order_id: 1 + region: APAC + product: Widget A + amount: 1500.0 + - order_id: 2 + region: EMEA + product: Widget B + amount: 800.5 + - order_id: 3 + region: AMER + product: Widget A + amount: 2200.0 + - order_id: 4 + region: APAC + product: Widget C + amount: 350.75 + - order_id: 5 + region: EMEA + product: Widget A + amount: 1100.0 + - order_id: 6 + region: AMER + product: Widget B + amount: 950.25 + - order_id: 7 + region: APAC + product: Widget B + amount: 1750.0 + - order_id: 8 + region: EMEA + product: Widget C + amount: 420.0 + - order_id: 9 + region: AMER + product: Widget C + amount: 680.5 + - order_id: 10 + region: APAC + product: Widget A + amount: 3000.0 diff --git a/bigquery-setup/demo.sh b/bigquery-setup/demo.sh new file mode 100644 index 000000000..270ab869a --- /dev/null +++ b/bigquery-setup/demo.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Thin convenience wrapper: forwards every argument to demo-bigquery.sh in the
# parent directory of this script, so the demo can also be started from here.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Absolute path of the parent directory, where demo-bigquery.sh is expected.
WAYANG_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# Replace this shell with the real demo script, passing all arguments through.
exec "$WAYANG_ROOT/demo-bigquery.sh" "$@"
+# +# Ports: +# HTTP (REST API): http://localhost:9050 +# gRPC (Storage API): localhost:9060 + +services: + + bigquery: + image: ghcr.io/goccy/bigquery-emulator:0.6.6 + platform: linux/amd64 + container_name: bigquery-emulator + ports: + - "9050:9050" + - "9060:9060" + volumes: + - ./data.yaml:/data.yaml + command: --project=test-project --data-from-yaml=/data.yaml + healthcheck: + test: ["CMD-SHELL", "bash -c ': >/dev/tcp/localhost/9050'"] + interval: 10s + timeout: 5s + retries: 5 diff --git a/bigquery-setup/pom.xml b/bigquery-setup/pom.xml new file mode 100644 index 000000000..9ffd14533 --- /dev/null +++ b/bigquery-setup/pom.xml @@ -0,0 +1,79 @@ + + + + 4.0.0 + + org.apache.wayang + bigquery-setup + 1.0-SNAPSHOT + jar + + BigQuery Local Setup — Integration Tests + + Standalone integration tests for a local BigQuery emulator. + Independent of the Wayang codebase. + + + + 11 + 11 + UTF-8 + 5.10.2 + 2.49.0 + + + + + + com.google.cloud + google-cloud-bigquery + ${bigquery.version} + test + + + + + org.junit.jupiter + junit-jupiter + ${junit.version} + test + + + + + org.slf4j + slf4j-simple + 2.0.12 + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.2.5 + + + + diff --git a/bigquery-setup/src/test/java/org/apache/wayang/bigquery/BigQueryEmulatorIT.java b/bigquery-setup/src/test/java/org/apache/wayang/bigquery/BigQueryEmulatorIT.java new file mode 100644 index 000000000..6c03f7843 --- /dev/null +++ b/bigquery-setup/src/test/java/org/apache/wayang/bigquery/BigQueryEmulatorIT.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery; + +import com.google.auth.oauth2.GoogleCredentials; +import com.google.cloud.NoCredentials; +import com.google.cloud.bigquery.*; +import org.junit.jupiter.api.*; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Integration tests for the local BigQuery emulator. + * + * Prerequisites: run `docker-compose up -d` first. + * + * Run tests: + * mvn test -Pintegration + */ +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +class BigQueryEmulatorIT { + + private static final String EMULATOR_HOST = System.getenv().getOrDefault("BIGQUERY_HOST", "http://localhost:9050"); + private static final String PROJECT_ID = "test-project"; + private static final String DATASET = "sales"; + + private static BigQuery bigquery; + private static boolean emulatorAvailable = false; + + @BeforeAll + static void setupClient() { + try { + bigquery = BigQueryOptions.newBuilder() + .setHost(EMULATOR_HOST) + .setLocation("US") + .setProjectId(PROJECT_ID) + .setCredentials(NoCredentials.getInstance()) + .build() + .getService(); + + // Quick connectivity check + bigquery.getDataset(DatasetId.of(PROJECT_ID, DATASET)); + emulatorAvailable = true; + System.out.printf("Connected to BigQuery emulator at %s%n", EMULATOR_HOST); + } catch (Exception e) { + System.err.println("BigQuery emulator not available: " + e.getMessage()); + } + } + + private List> runQuery(String sql) throws InterruptedException { + QueryJobConfiguration config = QueryJobConfiguration.newBuilder(sql) + 
.setUseLegacySql(false) + .build(); + TableResult result = bigquery.query(config); + List> rows = new ArrayList<>(); + for (FieldValueList row : result.iterateAll()) { + List r = new ArrayList<>(); + for (FieldValue val : row) { + r.add(val.isNull() ? null : val.getValue()); + } + rows.add(r); + } + return rows; + } + + // ── Test 1: Dataset visible ────────────────────────────────────────── + + @Test + @Order(1) + @DisplayName("BigQuery emulator: dataset 'sales' is visible") + void testDatasetVisible() { + Assumptions.assumeTrue(emulatorAvailable, "Emulator not available"); + + Dataset ds = bigquery.getDataset(DatasetId.of(PROJECT_ID, DATASET)); + assertNotNull(ds, "Dataset 'sales' should exist"); + System.out.println("[PASS] Dataset 'sales' is visible"); + } + + // ── Test 2: Full table scan ────────────────────────────────────────── + + @Test + @Order(2) + @DisplayName("BigQuery emulator: full table scan on orders") + void testFullScan() throws Exception { + Assumptions.assumeTrue(emulatorAvailable, "Emulator not available"); + + List> rows = runQuery( + "SELECT * FROM `test-project.sales.orders` ORDER BY order_id" + ); + assertEquals(10, rows.size(), "Expected 10 rows"); + System.out.println("[PASS] Full scan: " + rows.size() + " rows"); + rows.forEach(r -> System.out.println(" " + r)); + } + + // ── Test 3: Filter by region ───────────────────────────────────────── + + @Test + @Order(3) + @DisplayName("BigQuery emulator: filter by region = APAC") + void testFilterByRegion() throws Exception { + Assumptions.assumeTrue(emulatorAvailable, "Emulator not available"); + + List> rows = runQuery( + "SELECT order_id, region, amount FROM `test-project.sales.orders` WHERE region = 'APAC' ORDER BY order_id" + ); + assertFalse(rows.isEmpty(), "Should have APAC rows"); + rows.forEach(r -> assertEquals("APAC", r.get(1), "All rows should be APAC")); + System.out.printf("[PASS] Filter: %d APAC rows%n", rows.size()); + } + + // ── Test 4: Filter by amount 
───────────────────────────────────────── + + @Test + @Order(4) + @DisplayName("BigQuery emulator: filter by amount > 1000") + void testFilterByAmount() throws Exception { + Assumptions.assumeTrue(emulatorAvailable, "Emulator not available"); + + List> rows = runQuery( + "SELECT order_id, amount FROM `test-project.sales.orders` WHERE amount > 1000 ORDER BY amount DESC" + ); + assertFalse(rows.isEmpty()); + rows.forEach(r -> assertTrue( + Double.parseDouble(r.get(1).toString()) > 1000.0, + "All amounts should be > 1000" + )); + System.out.printf("[PASS] Amount filter: %d rows with amount > 1000%n", rows.size()); + } + + // ── Test 5: Aggregation ────────────────────────────────────────────── + + @Test + @Order(5) + @DisplayName("BigQuery emulator: aggregate by region") + void testAggregation() throws Exception { + Assumptions.assumeTrue(emulatorAvailable, "Emulator not available"); + + List> rows = runQuery( + "SELECT region, COUNT(*) AS cnt, SUM(amount) AS total " + + "FROM `test-project.sales.orders` GROUP BY region ORDER BY total DESC" + ); + assertFalse(rows.isEmpty()); + System.out.println("[PASS] Aggregation by region:"); + rows.forEach(r -> System.out.printf(" region=%-5s count=%s total=%s%n", + r.get(0), r.get(1), r.get(2))); + } + + // ── Test 6: Projection ─────────────────────────────────────────────── + + @Test + @Order(6) + @DisplayName("BigQuery emulator: projection (region, product)") + void testProjection() throws Exception { + Assumptions.assumeTrue(emulatorAvailable, "Emulator not available"); + + List> rows = runQuery( + "SELECT region, product FROM `test-project.sales.orders` LIMIT 5" + ); + assertEquals(5, rows.size()); + rows.forEach(r -> { + assertNotNull(r.get(0), "region should not be null"); + assertNotNull(r.get(1), "product should not be null"); + }); + System.out.println("[PASS] Projection (region, product): 5 rows"); + } + + // ── Test 7: COUNT(*) ───────────────────────────────────────────────── + + @Test + @Order(7) + 
@DisplayName("BigQuery emulator: SELECT count(*)") + void testCount() throws Exception { + Assumptions.assumeTrue(emulatorAvailable, "Emulator not available"); + + List> rows = runQuery( + "SELECT count(*) FROM `test-project.sales.orders`" + ); + assertEquals(1, rows.size()); + long count = Long.parseLong(rows.get(0).get(0).toString()); + assertEquals(10, count, "Should have 10 rows"); + System.out.println("[PASS] COUNT(*) = " + count); + } +} diff --git a/demo-bigquery.sh b/demo-bigquery.sh new file mode 100644 index 000000000..dcfe42b40 --- /dev/null +++ b/demo-bigquery.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+set -euo pipefail
+
+# Resolve the repository root from this script's own location so the demo can
+# be launched from any working directory.
+WAYANG_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Passing --live as the first argument requests execution against a real JDBC
+# endpoint; it only takes effect when BQ_URL is also set.
+LIVE_MODE=false
+[[ "${1:-}" == "--live" ]] && LIVE_MODE=true
+
+BQ_PROJECT="${BQ_PROJECT:-my-project}"
+BQ_URL="${BQ_URL:-}"
+MAVEN_FLAGS="-Pskip-prerequisite-check -Drat.skip=true -Dmaven.javadoc.skip=true"
+
+# Tell the user when --live cannot be honoured instead of silently running the
+# offline branches below.
+if [[ "$LIVE_MODE" == true && -z "$BQ_URL" ]]; then
+    echo "WARNING: --live was requested but BQ_URL is empty; running the offline demo." >&2
+fi
+
+# Print a framed section title.
+banner() {
+    echo
+    echo "============================================================"
+    printf " %s\n" "$*"
+    echo "============================================================"
+    echo
+}
+
+# Print a single step description.
+step() {
+    echo
+    echo "-- $*"
+    echo
+}
+
+# Wait for ENTER between acts unless WAYANG_DEMO_AUTO=true.
+pause() {
+    if [[ "${WAYANG_DEMO_AUTO:-false}" != "true" ]]; then
+        echo
+        # `|| true`: a failing read (e.g. EOF on stdin) must not abort under `set -e`.
+        read -rp "Press ENTER to continue..." _ || true
+        echo
+    fi
+}
+
+# Run one main class of the wayang-bigquery module through the Maven wrapper.
+#   $1   fully qualified main class
+#   $2…  extra arguments handed to Maven (the -D properties read by the demo)
+# A failing run does not abort the demo, but it is reported on stderr.
+run_demo_class() {
+    local main_class="$1"
+    shift
+    local status=0
+    cd "$WAYANG_ROOT"
+    # MAVEN_FLAGS is intentionally unquoted: it holds several space-separated flags.
+    "$WAYANG_ROOT/mvnw" exec:java -pl wayang-platforms/wayang-bigquery \
+        -Dexec.mainClass="$main_class" \
+        "$@" \
+        ${MAVEN_FLAGS} -q 2>/dev/null || status=$?
+    if (( status != 0 )); then
+        echo "WARNING: ${main_class} exited with status ${status}; continuing with the demo." >&2
+    fi
+}
+
+banner "ACT 1: BigQuery cost model"
+step "Read cost settings from wayang-bigquery-defaults.properties"
+run_demo_class "org.apache.wayang.bigquery.BigQueryDemo" \
+    "-Dbigquery.mode=cost" \
+    "-Dbigquery.project=${BQ_PROJECT}"
+
+pause
+
+banner "ACT 2: BigQuery filter operator"
+if [[ "$LIVE_MODE" == true && -n "$BQ_URL" ]]; then
+    run_demo_class "org.apache.wayang.bigquery.BigQueryDemo" \
+        "-Dbigquery.mode=filter" \
+        "-Dbigquery.url=${BQ_URL}" \
+        "-Dbigquery.project=${BQ_PROJECT}"
+else
+    run_demo_class "org.apache.wayang.bigquery.BigQueryDemo" \
+        "-Dbigquery.mode=filter" \
+        "-Dbigquery.project=${BQ_PROJECT}"
+fi
+
+pause
+
+banner "ACT 3: BigQuery projection operator"
+if [[ "$LIVE_MODE" == true && -n "$BQ_URL" ]]; then
+    run_demo_class "org.apache.wayang.bigquery.BigQueryDemo" \
+        "-Dbigquery.mode=projection" \
+        "-Dbigquery.url=${BQ_URL}" \
+        "-Dbigquery.project=${BQ_PROJECT}"
+else
+    run_demo_class "org.apache.wayang.bigquery.BigQueryDemo" \
+        "-Dbigquery.mode=projection" \
+        "-Dbigquery.project=${BQ_PROJECT}"
+fi
+
+banner "Demo complete"
diff --git
a/improvement.md b/improvement.md new file mode 100644 index 000000000..39e1402b2 --- /dev/null +++ b/improvement.md @@ -0,0 +1,132 @@ + + +# BigQuery engine-only integration test + +## 1. What this branch demonstrates + +The question this branch answers is **not** "does BigQuery execute some single +operator?" but: + +> From `WayangContext.execute(...)` to the end of the whole Wayang plan, do all +> data processing **and** the final sink run inside BigQuery, **without** +> registering `Java.basicPlugin()`? + +On this branch the answer is **yes**. `BigQueryOperatorsIT`: + +- registers **only** `BigQuery.plugin()` — no `Java.basicPlugin()`; +- ends **every** Wayang plan in a BigQuery `TableSink`, which compiles to a single + `CREATE TABLE \`project.dataset.table\` AS SELECT ...` executed inside BigQuery; +- after `WayangContext.execute(...)` returns, JUnit reads the result table with a + plain JDBC query (assertion only — not part of the Wayang plan); +- handles the join `Tuple2<Record, Record>` vs flat `Record` mismatch with a + test-only flatten mapping (see §4). This is a test-only scheme, not a final + decision on Tuple-to-Record semantics for JDBC platforms. + +This mirrors the Trino-only work on `wayang-trino-only-test`; the contrast is the +older mixed branch `wayang-bigquery`, which registered both `Java.basicPlugin()` +and `BigQuery.plugin()` and ended most operator tests in a Java `LocalCallbackSink` +or `.collect()`. + +## 2. Execution shape + +```text +BigQuery TableSource -> BigQuery operator(s) -> BigQuery TableSink + | + v + CREATE TABLE `proj.sales.wayang_operator_result` AS SELECT ... + +WayangContext.execute(...) returns + | + v + JUnit queries the result table over JDBC (assertions only) +``` + +The final JDBC query is part of the test only: it is not in the Wayang logical +plan, it is not a Wayang Java execution operator, and it does not process plan +data on BigQuery's behalf — it just inspects what BigQuery already wrote.
+ +Because **no** `Java.basicPlugin()` is registered, the optimizer has no Java +operators to fall back to, so pushdown is forced — the small reference table does +not need to be scaled to make pushdown the cheaper plan, and the sink table +appearing in BigQuery with the correct contents is itself proof that the +`CREATE TABLE ... AS SELECT` ran inside BigQuery. + +## 3. The shared executor change + +All JDBC platforms share `wayang-jdbc-template`'s `JdbcExecutor`. When a stage's +terminal task is a `JdbcTableSinkOperator`, `JdbcExecutor.executeSinkStage(...)` +composes and runs the `CREATE TABLE ... AS SELECT` directly on the connection. + +The previous BigQuery branch's `executeSinkStage` (identical to `wayang-trino`'s) +had two gaps that only surface once **every** test ends in a `TableSink`: + +1. It used `selectStartTask(...)` only on the normal query-channel path, not in the + sink path, where it asserted a single source — so a join (two sources) could not + be composed into the sink. +2. It only collected filter, projection and join; it threw `WayangException` for + global reduce, reduce-by and sort, and passed `null` for them to + `createSqlString(...)`. + +This branch ports the engine-only `executeSinkStage` (identical to the file on +`wayang-trino-only-test`): it uses `selectStartTask(...)` for multi-source joins +and collects global reduce / reduce-by / sort, passing them into the existing +`createSqlString(...)`. The file is platform-agnostic. (Assertions are enabled +under Maven — `pom.xml` `enableAssertions=true` — so without this change a +join/reduce/sort sink would fail loudly, not silently.) + +BigQuery dialect notes: the generated SQL is dialect-valid — backtick-quoted +fully-qualified table names, no trailing semicolon, and `CREATE TABLE ... AS` / +`DROP TABLE IF EXISTS` (DDL) only, never DML — so the suite runs on a free-tier +(no-billing) project. + +## 4. 
The join flatten mapping + +A logical `JoinOperator` emits `Tuple2<Record, Record>`, while a pushed-down JDBC +join already emits a flat `Record`. The test wires an explicit flatten `MapOperator` +(named `JOIN_FLATTEN_NAME`) and registers a test-only `JoinFlattenMapping` on the +configuration whitelist; the mapping rewrites that named map into a +`BigQueryProjectionOperator`, so the flatten is also pushed into BigQuery SQL and +the plan stays entirely in BigQuery. The join lookup table's key column is renamed +to `region_name` so the flattened `CREATE TABLE AS SELECT` has no duplicate column. + +## 5. Coverage and results + +`BigQueryOperatorsIT` runs 13 tests (8 operator-level + 5 high-level +`JavaPlanBuilder`) covering `TableSource`, `Filter`, `Projection`, `Join`, +`GlobalReduce`, `ReduceBy`, `Sort`, `TableSink`. Each composes a +`CREATE TABLE ... AS SELECT` executed inside BigQuery. + +Unlike Trino/Presto, this suite runs against **real BigQuery** (the JDBC driver +needs OAuth2; the local emulator cannot serve it), so it requires a live GCP +project + service account. If a connection cannot be established the whole class +is skipped (not failed). + +```bash +JAVA_HOME=/path/to/jdk-17 mvn test -pl wayang-platforms/wayang-bigquery -am \ + -Dtest=BigQueryOperatorsIT -DfailIfNoTests=false -Dsurefire.failIfNoSpecifiedTests=false \ + -Dbigquery.project=YOUR_PROJECT_ID \ + -Dbigquery.saEmail=wayang-bq@YOUR_PROJECT_ID.iam.gserviceaccount.com \ + -Dbigquery.keyPath=$HOME/wayang-bq-key.json \ + -Drat.skip=true -Dlicense.skip=true -Pskip-prerequisite-check +``` + +The reference table (default `YOUR_PROJECT_ID.sales.orders`, 10 rows) must be seeded +first (see the setup notes); the suite creates and drops its own +`sales.wayang_operator_result` and `sales.wayang_regions` tables. Expected: +`Tests run: 13, Failures: 0, Errors: 0, Skipped: 0`.
diff --git a/pom.xml b/pom.xml index f0d5cf77e..b01d2f93e 100644 --- a/pom.xml +++ b/pom.xml @@ -1075,7 +1075,7 @@ - ASF 2.0 | The Apache Software License, Version 2.0 | Apache License, Version 2.0 | Apache 2.0 License | Apache License Version 2.0 | Apache 2.0 | Apache-2.0 | The Apache License, Version 2.0 | Apache License Version 2 | Apache 2 | http://www.apache.org/licenses/LICENSE-2.0.txt | Apache License 2.0 | Apache Software License - Version 2.0 | Apache License, version 2.0 | Apache License v2.0 + ASF 2.0 | The Apache Software License, Version 2.0 | Apache License, Version 2.0 | Apache License V2.0 | Apache 2.0 License | Apache License Version 2.0 | Apache 2.0 | Apache-2.0 | The Apache License, Version 2.0 | Apache License Version 2 | Apache 2 | http://www.apache.org/licenses/LICENSE-2.0.txt | Apache License 2.0 | Apache Software License - Version 2.0 | Apache License, version 2.0 | Apache License v2.0 BSD 3-claus | 3-Clause BSD License | BSD 3 Clause License | BSD 3 Clause | BSD 3-Clause "New" or "Revised" License (BSD-3-Clause) | BSD licence | BSD | New BSD License | Revised BSD | The BSD 3-Clause License | The BSD License | The New BSD License | New BSD license | BSD 3-clause | BSD 3-Clause | BSD-3-Clause | BSD New license | Go License MIT | MIT License | The MIT License | The MIT License (MIT) | MIT license BSD 2-claus | BSD 2-Clause License | BSD 2-Clause | BSD-2-Clause diff --git a/wayang-api/wayang-api-scala-java/src/main/scala/org/apache/wayang/api/DataQuantaBuilder.scala b/wayang-api/wayang-api-scala-java/src/main/scala/org/apache/wayang/api/DataQuantaBuilder.scala index d18ed3f85..9d37aa930 100644 --- a/wayang-api/wayang-api-scala-java/src/main/scala/org/apache/wayang/api/DataQuantaBuilder.scala +++ b/wayang-api/wayang-api-scala-java/src/main/scala/org/apache/wayang/api/DataQuantaBuilder.scala @@ -28,7 +28,7 @@ import org.apache.wayang.api.graph.{Edge, EdgeDataQuantaBuilder, EdgeDataQuantaB import org.apache.wayang.api.util.{DataQuantaBuilderCache, 
TypeTrap} import org.apache.wayang.basic.data.{Record, Tuple2 => RT2} import org.apache.wayang.basic.model.{DLModel, Model, LogisticRegressionModel,DecisionTreeRegressionModel} -import org.apache.wayang.basic.operators.{DLTrainingOperator, GlobalReduceOperator, LocalCallbackSink, MapOperator, SampleOperator, LogisticRegressionOperator,DecisionTreeRegressionOperator, LinearSVCOperator} +import org.apache.wayang.basic.operators.{DLTrainingOperator, GlobalReduceOperator, JoinOperator, LocalCallbackSink, MapOperator, ReduceByOperator, SampleOperator, SortOperator, LogisticRegressionOperator,DecisionTreeRegressionOperator, LinearSVCOperator} import org.apache.wayang.commons.util.profiledb.model.Experiment import org.apache.wayang.core.api.spatial.{SpatialGeometry, SpatialPredicate} import org.apache.wayang.core.function.FunctionDescriptor.{SerializableBiFunction, SerializableBinaryOperator, SerializableFunction, SerializableIntUnaryOperator, SerializablePredicate} @@ -1020,6 +1020,10 @@ class SortDataQuantaBuilder[T, Key](inputDataQuanta: DataQuantaBuilder[_, T], /** [[LoadEstimator]] to estimate the RAM load of the [[keyUdf]]. */ private var keyUdfRamEstimator: LoadEstimator = _ + /** SQL column and direction implementing the sort key. */ + private var sqlColumnName: String = _ + private var sqlDirection: String = _ + // Try to infer the type classes from the UDFs. locally { @@ -1060,8 +1064,27 @@ class SortDataQuantaBuilder[T, Key](inputDataQuanta: DataQuantaBuilder[_, T], this } - override protected def build = - applyTargetPlatforms(inputDataQuanta.dataQuanta().sortJava(keyUdf)(this.keyTag), this.getTargetPlatforms()) + /** + * Add a SQL implementation of the sort key. + * + * @param columnName SQL column to sort by + * @param direction SQL sort direction, e.g. 
`ASC` or `DESC` + * @return this instance + */ + def withSqlUdf(columnName: String, direction: String) = { + this.sqlColumnName = columnName + this.sqlDirection = direction + this + } + + override protected def build = { + val result = inputDataQuanta.dataQuanta().sortJava(keyUdf)(this.keyTag) + if (this.sqlColumnName != null) { + result.operator.asInstanceOf[SortOperator[T, Key]] + .getKeyDescriptor.withSqlImplementation(this.sqlColumnName, this.sqlDirection) + } + applyTargetPlatforms(result, this.getTargetPlatforms()) + } } @@ -1283,6 +1306,10 @@ class ReduceByDataQuantaBuilder[Key, T](inputDataQuanta: DataQuantaBuilder[_, T] /** [[LoadProfileEstimator]] to estimate the [[LoadProfile]] of the [[udf]]. */ private var udfLoadProfileEstimator: LoadProfileEstimator = _ + /** SQL implementations of the grouping key and reduction. */ + private var keySqlUdf: String = _ + private var reduceSqlUdf: String = _ + // TODO: Add these estimators. // /** [[LoadEstimator]] to estimate the CPU load of the [[keyUdf]]. */ // private var keyUdfCpuEstimator: LoadEstimator = _ @@ -1322,7 +1349,29 @@ class ReduceByDataQuantaBuilder[Key, T](inputDataQuanta: DataQuantaBuilder[_, T] this } - override protected def build = applyTargetPlatforms(inputDataQuanta.dataQuanta().reduceByKeyJava(keyUdf, udf, this.udfLoadProfileEstimator), this.getTargetPlatforms()) + /** + * Add SQL implementations of the grouping key and reduction. 
+ * + * @param keySqlUdf SQL grouping column + * @param reduceSqlUdf SQL aggregate expression + * @return this instance + */ + def withSqlUdfs(keySqlUdf: String, reduceSqlUdf: String) = { + this.keySqlUdf = keySqlUdf + this.reduceSqlUdf = reduceSqlUdf + this + } + + override protected def build = { + val result = inputDataQuanta.dataQuanta() + .reduceByKeyJava(keyUdf, udf, this.udfLoadProfileEstimator) + if (this.keySqlUdf != null) { + val operator = result.operator.asInstanceOf[ReduceByOperator[T, Key]] + operator.getKeyDescriptor.withSqlImplementation(this.keySqlUdf, this.keySqlUdf) + operator.getReduceDescriptor.withSqlImplementation(this.reduceSqlUdf) + } + applyTargetPlatforms(result, this.getTargetPlatforms()) + } } /** @@ -1402,6 +1451,9 @@ class GlobalReduceDataQuantaBuilder[T](inputDataQuanta: DataQuantaBuilder[_, T], /** [[LoadProfileEstimator]] to estimate the [[LoadProfile]] of the [[udf]]. */ private var udfLoadProfileEstimator: LoadProfileEstimator = _ + /** SQL implementation of the reduction. */ + private var sqlUdf: String = _ + // Try to infer the type classes from the udf. locally { val parameters = ReflectionUtils.getTypeParameters(udf.getClass, classOf[SerializableBinaryOperator[_]]) @@ -1422,7 +1474,25 @@ class GlobalReduceDataQuantaBuilder[T](inputDataQuanta: DataQuantaBuilder[_, T], this } - override protected def build = applyTargetPlatforms(inputDataQuanta.dataQuanta().reduceJava(udf, this.udfLoadProfileEstimator), this.getTargetPlatforms()) + /** + * Add a SQL implementation of the reduction. 
+ * + * @param sqlUdf SQL aggregate expression + * @return this instance + */ + def withSqlUdf(sqlUdf: String) = { + this.sqlUdf = sqlUdf + this + } + + override protected def build = { + val result = inputDataQuanta.dataQuanta().reduceJava(udf, this.udfLoadProfileEstimator) + if (this.sqlUdf != null) { + result.operator.asInstanceOf[GlobalReduceOperator[T]] + .getReduceDescriptor.withSqlImplementation(this.sqlUdf) + } + applyTargetPlatforms(result, this.getTargetPlatforms()) + } } @@ -1490,6 +1560,12 @@ class JoinDataQuantaBuilder[In0, In1, Key](inputDataQuanta0: DataQuantaBuilder[_ /** [[LoadEstimator]] to estimate the RAM load of the [[keyUdf1]]. */ private var keyUdf1RamEstimator: LoadEstimator = _ + /** SQL implementations of both join keys. */ + private var keyUdf0TableName: String = _ + private var keyUdf0SqlUdf: String = _ + private var keyUdf1TableName: String = _ + private var keyUdf1SqlUdf: String = _ + // Try to infer the type classes from the UDFs. locally { val parameters = ReflectionUtils.getTypeParameters(keyUdf0.getClass, classOf[SerializableFunction[_, _]]) @@ -1568,6 +1644,22 @@ class JoinDataQuantaBuilder[In0, In1, Key](inputDataQuanta0: DataQuantaBuilder[_ this } + /** + * Add SQL implementations of both join keys. + * + * @return this instance + */ + def withSqlUdfs(thisTableName: String, + thisKeySqlUdf: String, + thatTableName: String, + thatKeySqlUdf: String) = { + this.keyUdf0TableName = thisTableName + this.keyUdf0SqlUdf = thisKeySqlUdf + this.keyUdf1TableName = thatTableName + this.keyUdf1SqlUdf = thatKeySqlUdf + this + } + /** * Assemble the joined elements to new elements. 
* @@ -1579,8 +1671,16 @@ class JoinDataQuantaBuilder[In0, In1, Key](inputDataQuanta0: DataQuantaBuilder[_ override def apply(joinTuple: RT2[In0, In1]): NewOut = udf.apply(joinTuple.field0, joinTuple.field1) }) - override protected def build = - applyTargetPlatforms(inputDataQuanta0.dataQuanta().joinJava(keyUdf0, inputDataQuanta1.dataQuanta(), keyUdf1)(inputDataQuanta1.classTag, this.keyTag), this.getTargetPlatforms()) + override protected def build = { + val result = inputDataQuanta0.dataQuanta() + .joinJava(keyUdf0, inputDataQuanta1.dataQuanta(), keyUdf1)(inputDataQuanta1.classTag, this.keyTag) + if (this.keyUdf0SqlUdf != null) { + val operator = result.operator.asInstanceOf[JoinOperator[In0, In1, Key]] + operator.getKeyDescriptor0.withSqlImplementation(this.keyUdf0TableName, this.keyUdf0SqlUdf) + operator.getKeyDescriptor1.withSqlImplementation(this.keyUdf1TableName, this.keyUdf1SqlUdf) + } + applyTargetPlatforms(result, this.getTargetPlatforms()) + } } diff --git a/wayang-platforms/pom.xml b/wayang-platforms/pom.xml index 6a852c165..dd063522f 100644 --- a/wayang-platforms/pom.xml +++ b/wayang-platforms/pom.xml @@ -43,6 +43,7 @@ wayang-giraph wayang-flink wayang-generic-jdbc + wayang-bigquery wayang-tensorflow diff --git a/wayang-platforms/wayang-bigquery/pom.xml b/wayang-platforms/wayang-bigquery/pom.xml new file mode 100644 index 000000000..bf3caef58 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/pom.xml @@ -0,0 +1,90 @@ + + + + 4.0.0 + + + wayang-platforms + org.apache.wayang + 1.1.2-SNAPSHOT + + + wayang-bigquery + + Wayang Platform BigQuery + + Wayang implementation of the operators to be working with the platform "BigQuery" + + + + org.apache.wayang.platform.bigquery + 0.6.0 + + + + + + com.google.cloud + google-cloud-bigquery-jdbc + ${bigquery-jdbc.version} + all + + + org.apache.wayang + wayang-basic + 1.1.2-SNAPSHOT + + + org.apache.wayang + wayang-jdbc-template + 1.1.2-SNAPSHOT + + + org.apache.wayang + wayang-spark + 1.1.2-SNAPSHOT + + + 
org.apache.wayang + wayang-api-scala-java + 1.1.2-SNAPSHOT + test + + + org.junit.jupiter + junit-jupiter + 5.10.2 + test + + + + + + + org.codehaus.mojo + exec-maven-plugin + 3.1.0 + + + + + diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/BigQuery.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/BigQuery.java new file mode 100644 index 000000000..c07b2138e --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/BigQuery.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery; + + +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.bigquery.plugin.BigQueryConversionsPlugin; +import org.apache.wayang.bigquery.plugin.BigQueryPlugin; + +/** + * Register for relevant components of this module. + */ +public class BigQuery { + + private final static BigQueryPlugin PLUGIN = new BigQueryPlugin(); + + private final static BigQueryConversionsPlugin CONVERSIONS_PLUGIN = new BigQueryConversionsPlugin(); + + /** + * Retrieve the {@link BigQueryPlugin}. 
+ * + * @return the {@link BigQueryPlugin} + */ + public static BigQueryPlugin plugin() { + return PLUGIN; + } + + /** + * Retrieve the {@link BigQueryConversionsPlugin}. + * + * @return the {@link BigQueryConversionsPlugin} + */ + public static BigQueryConversionsPlugin conversionPlugin() { + return CONVERSIONS_PLUGIN; + } + + + /** + * Retrieve the {@link BigQueryPlatform}. + * + * @return the {@link BigQueryPlatform} + */ + public static BigQueryPlatform platform() { + return BigQueryPlatform.getInstance(); + } + +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/BigQueryDemo.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/BigQueryDemo.java new file mode 100644 index 000000000..18349d3db --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/BigQueryDemo.java @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.wayang.bigquery; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.function.ProjectionDescriptor; +import org.apache.wayang.basic.operators.FilterOperator; +import org.apache.wayang.basic.operators.LocalCallbackSink; +import org.apache.wayang.basic.operators.MapOperator; +import org.apache.wayang.basic.types.RecordType; +import org.apache.wayang.bigquery.operators.BigQueryTableSource; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.api.Configuration; +import org.apache.wayang.core.api.WayangContext; +import org.apache.wayang.core.function.PredicateDescriptor; +import org.apache.wayang.core.plan.wayangplan.WayangPlan; +import org.apache.wayang.core.types.DataSetType; +import org.apache.wayang.java.Java; + +import java.util.ArrayList; +import java.util.List; + +/** + * Standalone demo for the Wayang BigQuery connector. + * + *

<p>Controlled by {@code -Dbigquery.mode}:
+ * <ul>
+ *   <li>{@code cost} — three-layer cost model (no credentials needed)</li>
+ *   <li>{@code filter} — filter operator pushdown demo</li>
+ *   <li>{@code projection} — projection + filter operator pushdown demo</li>
+ * </ul>
+ *
+ * <p>Run with:
+ * <pre>
+ *   mvn exec:java -pl wayang-platforms/wayang-bigquery \
+ *     -Dexec.mainClass=org.apache.wayang.bigquery.BigQueryDemo \
+ *     -Dbigquery.mode=cost \
+ *     -Pskip-prerequisite-check -Drat.skip=true
+ * </pre>
+ */
+public class BigQueryDemo {
+
+    /** Demo mode selected with {@code -Dbigquery.mode}; defaults to {@code cost}. */
+    private static final String MODE = System.getProperty("bigquery.mode", "cost");
+    /** JDBC URL from {@code -Dbigquery.url}; an empty value selects the offline sample data. */
+    private static final String JDBC_URL = System.getProperty("bigquery.url", "");
+    /** Project id from {@code -Dbigquery.project}, used to qualify the table name. */
+    private static final String PROJECT = System.getProperty("bigquery.project", "my-project");
+
+    // 20-row dataset: 4 regions (AMER/APAC/EMEA/LATAM), 5 products (Widget A-E)
+    // AMER rows: 3, 6, 9, 12, 16 → 5 rows for filter demo
+    // Column order: order_id, region, product, amount, order_date.
+    private static final String[][] SAMPLE_DATA = {
+        {"1", "APAC", "Widget A", "1500.00", "2024-01-15"},
+        {"2", "EMEA", "Widget B", "800.50", "2024-01-16"},
+        {"3", "AMER", "Widget A", "2200.00", "2024-01-17"},
+        {"4", "APAC", "Widget C", "350.75", "2024-01-18"},
+        {"5", "EMEA", "Widget A", "1100.00", "2024-01-19"},
+        {"6", "AMER", "Widget B", "950.25", "2024-01-20"},
+        {"7", "APAC", "Widget B", "1750.00", "2024-01-21"},
+        {"8", "EMEA", "Widget C", "420.00", "2024-01-22"},
+        {"9", "AMER", "Widget C", "680.50", "2024-01-23"},
+        {"10", "APAC", "Widget A", "3000.00", "2024-01-24"},
+        {"11", "LATAM", "Widget D", "560.00", "2024-01-25"},
+        {"12", "AMER", "Widget D", "1320.75", "2024-01-26"},
+        {"13", "EMEA", "Widget D", "990.00", "2024-01-27"},
+        {"14", "LATAM", "Widget E", "2100.50", "2024-01-28"},
+        {"15", "APAC", "Widget E", "4500.00", "2024-01-29"},
+        {"16", "AMER", "Widget E", "3750.00", "2024-01-30"},
+        {"17", "EMEA", "Widget E", "1250.00", "2024-01-31"},
+        {"18", "LATAM", "Widget A", "870.25", "2024-02-01"},
+        {"19", "APAC", "Widget D", "1680.00", "2024-02-02"},
+        {"20", "LATAM", "Widget B", "440.50", "2024-02-03"},
+    };
+
+    /**
+     * Runs the demo selected by {@link #MODE}. Any value other than
+     * {@code cost}, {@code filter} or {@code projection} runs all three in turn.
+     *
+     * @param args ignored; the demo is configured through system properties
+     */
+    public static void main(String[] args) {
+        switch (MODE) {
+            case "cost": costModel(); break;
+            case "filter": filterDemo(); break;
+            case "projection": projectionDemo(); break;
+            default:
+                costModel();
+                filterDemo();
+                projectionDemo();
+        }
+    }
+
+    // ── Cost model ────────────────────────────────────────────────────────────
+
+    /**
+     * Prints the cost-related BigQuery properties and a worked example for a
+     * 10-row table scan. Needs no connection.
+     */
+    static void costModel() {
+        Configuration config = new Configuration();
+        BigQueryPlatform.getInstance().configureDefaults(config);
+
+        // The second argument is the fallback used when the property is absent.
+        long mhz = config.getLongProperty("wayang.bigquery.cpu.mhz", 0);
+        long cores = config.getLongProperty("wayang.bigquery.cores", 0);
+        double fix = config.getDoubleProperty("wayang.bigquery.costs.fix", 0);
+        double perMs = config.getDoubleProperty("wayang.bigquery.costs.per-ms", 1);
+
+        long rows = 10;
+        long alpha = 5;
+        long beta = 2_000_000;
+        long cpuCycles = alpha * rows + beta;
+        // Floating-point division: with the 0 fallbacks for cores/mhz this
+        // evaluates to Infinity rather than throwing.
+        double timeMs = cpuCycles / (cores * mhz * 1000.0);
+        double cost = fix + perMs * timeMs;
+
+        System.out.println();
+        System.out.println("══════════════════════════════════════════════════════");
+        System.out.println(" BigQuery — Cost Model Integration");
+        System.out.println("══════════════════════════════════════════════════════");
+        System.out.println();
+        System.out.println(" LAYER 1 — Cost formula (wayang-bigquery-defaults.properties)");
+        System.out.printf(" tablesource : %s%n", config.getStringProperty("wayang.bigquery.tablesource.load", null));
+        System.out.printf(" filter : %s%n", config.getStringProperty("wayang.bigquery.filter.load", null));
+        System.out.println();
+        System.out.println(" LAYER 2 — Hardware profile (cpu cycles -> wall-clock ms)");
+        System.out.printf(" cpu.mhz = %d cores = %d%n", mhz, cores);
+        System.out.println();
+        System.out.println(" LAYER 3 — Time -> abstract cost");
+        System.out.printf(" costs.fix = %.1f costs.per-ms = %.1f%n", fix, perMs);
+        System.out.println();
+        System.out.println(" -- Worked example: 10-row table scan --");
+        System.out.printf(" alpha = %d (per-row, serverless columnar)%n", alpha);
+        System.out.printf(" beta = %,d (cold-start / slot reservation)%n", beta);
+        System.out.printf(" cpu cycles = %d * %d + %,d = %,d%n", alpha, rows, beta, cpuCycles);
+        System.out.printf(" time = %,d / (%d * %d * 1000) = %.4f ms%n", cpuCycles, cores, mhz, timeMs);
+        System.out.printf(" cost = %.1f + %.1f * %.4f = %.4f%n", fix, perMs, timeMs, cost);
+        System.out.println();
+        System.out.println("══════════════════════════════════════════════════════");
+        System.out.println();
+    }
+
+    // ── Filter pushdown ───────────────────────────────────────────────────────
+
+    /**
+     * Shows the filter demo: runs the plan through Wayang when a JDBC URL is
+     * set, otherwise prints the AMER rows of {@link #SAMPLE_DATA}.
+     */
+    static void filterDemo() {
+        String table = String.format("`%s.sales.orders`", PROJECT);
+
+        System.out.println();
+        System.out.println("══════════════════════════════════════════════════════");
+        System.out.println(" BigQuery — Filter Operator Pushdown");
+        System.out.println("══════════════════════════════════════════════════════");
+        System.out.println();
+        System.out.println(" Operator: FilterOperator -> BigQueryFilterOperator");
+        System.out.printf(" SQL sent: SELECT * FROM %s%n", table);
+        System.out.println(" WHERE region = 'AMER'");
+        System.out.println();
+
+        if (!JDBC_URL.isEmpty()) {
+            runLiveFilter(table);
+        } else {
+            System.out.println(" Results (20-row dataset, AMER rows only):");
+            System.out.printf(" %-10s %-6s %-10s %10s %-12s%n",
+                "order_id", "region", "product", "amount", "order_date");
+            System.out.println(" " + repeat('-', 54));
+            int count = 0;
+            for (String[] row : SAMPLE_DATA) {
+                if ("AMER".equals(row[1])) {
+                    System.out.printf(" %-10s %-6s %-10s %10s %-12s%n",
+                        row[0], row[1], row[2], row[3], row[4]);
+                    count++;
+                }
+            }
+            System.out.println();
+            System.out.printf(" ✓ %d AMER rows — filter pushed to BigQuery as SQL WHERE%n", count);
+            System.out.println(" (pass -Dbigquery.url=... for live execution)");
+        }
+
+        System.out.println("══════════════════════════════════════════════════════");
+        System.out.println();
+    }
+
+    /**
+     * Builds and executes source -> filter -> collecting sink, then prints the
+     * collected records.
+     *
+     * @param table backtick-quoted, fully qualified table name
+     */
+    private static void runLiveFilter(String table) {
+        WayangContext wayang = buildWayang();
+        List<Record> results = new ArrayList<>();
+
+        BigQueryTableSource source = new BigQueryTableSource(
+            table, "order_id", "region", "product", "amount", "order_date"
+        );
+        // The Java predicate and its SQL implementation express the same condition.
+        FilterOperator<Record> filter = new FilterOperator<>(
+            new PredicateDescriptor<>(
+                r -> "AMER".equals(r.getField(1)), Record.class
+            ).withSqlImplementation("region = 'AMER'")
+        );
+        LocalCallbackSink<Record> sink = LocalCallbackSink.createCollectingSink(results, Record.class);
+        source.connectTo(0, filter, 0);
+        filter.connectTo(0, sink, 0);
+        wayang.execute("BigQuery-Filter-Demo", new WayangPlan(sink));
+
+        System.out.println(" Results returned by Wayang:");
+        System.out.printf(" %-10s %-6s %-10s %10s %-12s%n",
+            "order_id", "region", "product", "amount", "order_date");
+        System.out.println(" " + repeat('-', 54));
+        for (Record r : results) {
+            System.out.printf(" %-10s %-6s %-10s %10s %-12s%n",
+                r.getField(0), r.getField(1), r.getField(2), r.getField(3), r.getField(4));
+        }
+        System.out.println();
+        System.out.printf(" ✓ %d AMER rows via Wayang -> BigQuery SQL pushdown%n%n", results.size());
+    }
+
+    // ── Projection + Filter pushdown ──────────────────────────────────────────
+
+    /**
+     * Shows the projection + filter demo: runs the plan through Wayang when a
+     * JDBC URL is set, otherwise prints region, product and amount of the AMER
+     * rows of {@link #SAMPLE_DATA}.
+     */
+    static void projectionDemo() {
+        String table = String.format("`%s.sales.orders`", PROJECT);
+
+        System.out.println();
+        System.out.println("══════════════════════════════════════════════════════");
+        System.out.println(" BigQuery — Projection Operator Pushdown");
+        System.out.println("══════════════════════════════════════════════════════");
+        System.out.println();
+        System.out.println(" Operators: FilterOperator -> BigQueryFilterOperator");
+        System.out.println(" MapOperator -> BigQueryProjectionOperator");
+        System.out.printf(" SQL sent: SELECT region, product, amount%n");
+        System.out.printf(" FROM %s%n", table);
+        System.out.println(" WHERE region = 'AMER'");
+        System.out.println();
+        System.out.println(" Both operators collapsed into one SQL — only 3 of 5");
+        System.out.println(" columns transferred; order_id + order_date never leave BQ.");
+        System.out.println();
+
+        if (!JDBC_URL.isEmpty()) {
+            runLiveProjection(table);
+        } else {
+            System.out.println(" Results (projected: region, product, amount — AMER only):");
+            System.out.printf(" %-6s %-10s %10s%n", "region", "product", "amount");
+            System.out.println(" " + repeat('-', 30));
+            int count = 0;
+            for (String[] row : SAMPLE_DATA) {
+                if ("AMER".equals(row[1])) {
+                    System.out.printf(" %-6s %-10s %10s%n", row[1], row[2], row[3]);
+                    count++;
+                }
+            }
+            System.out.println();
+            System.out.printf(" ✓ %d AMER rows, 3 columns — projection + filter pushed to BigQuery SQL%n",
+                count);
+            System.out.println(" (pass -Dbigquery.url=... for live execution)");
+        }
+
+        System.out.println("══════════════════════════════════════════════════════");
+        System.out.println();
+    }
+
+    /**
+     * Builds and executes source -> filter -> projection -> collecting sink,
+     * then prints the three projected fields of each collected record.
+     *
+     * @param table backtick-quoted, fully qualified table name
+     */
+    private static void runLiveProjection(String table) {
+        WayangContext wayang = buildWayang();
+        List<Record> results = new ArrayList<>();
+
+        BigQueryTableSource source = new BigQueryTableSource(
+            table, "order_id", "region", "product", "amount", "order_date"
+        );
+        FilterOperator<Record> filter = new FilterOperator<>(
+            new PredicateDescriptor<>(
+                r -> "AMER".equals(r.getField(1)), Record.class
+            ).withSqlImplementation("region = 'AMER'")
+        );
+        // Record-aware multi-field projection (see TrinoDemo for rationale).
+        MapOperator<Record, Record> projection = new MapOperator<>(
+            ProjectionDescriptor.createForRecords(
+                new RecordType("order_id", "region", "product", "amount", "order_date"),
+                "region", "product", "amount"),
+            DataSetType.createDefault(Record.class),
+            DataSetType.createDefault(Record.class)
+        );
+        LocalCallbackSink<Record> sink = LocalCallbackSink.createCollectingSink(results, Record.class);
+        source.connectTo(0, filter, 0);
+        filter.connectTo(0, projection, 0);
+        projection.connectTo(0, sink, 0);
+        wayang.execute("BigQuery-Projection-Demo", new WayangPlan(sink));
+
+        System.out.println(" Results returned by Wayang (projected columns only):");
+        System.out.printf(" %-6s %-10s %10s%n", "region", "product", "amount");
+        System.out.println(" " + repeat('-', 30));
+        for (Record r : results) {
+            System.out.printf(" %-6s %-10s %10s%n", r.getField(0), r.getField(1), r.getField(2));
+        }
+        System.out.println();
+        System.out.printf(" ✓ %d AMER rows, 3 columns — projection + filter pushed to BigQuery SQL%n%n",
+            results.size());
+    }
+
+    // ── Shared helpers ────────────────────────────────────────────────────────
+
+    /**
+     * Creates a {@link WayangContext} with the Java and BigQuery plugins,
+     * configured with {@link #JDBC_URL}.
+     *
+     * @return the configured context
+     */
+    private static WayangContext buildWayang() {
+        Configuration config = new Configuration();
+        config.setProperty("wayang.bigquery.jdbc.url", JDBC_URL);
+        return new WayangContext(config)
+            .withPlugin(Java.basicPlugin())
+            .withPlugin(BigQuery.plugin());
+    }
+
+    /**
+     * Builds a string consisting of {@code n} copies of {@code c}.
+     *
+     * @param c the character to repeat
+     * @param n the number of repetitions
+     * @return the repeated string
+     */
+    private static String repeat(char c, int n) {
+        StringBuilder sb = new StringBuilder(n);
+        for (int i = 0; i < n; i++) sb.append(c);
+        return sb.toString();
+    }
+}
diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/channels/ChannelConversions.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/channels/ChannelConversions.java new file mode 100644 index 000000000..7079d22ef --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/channels/ChannelConversions.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.channels; + +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.optimizer.channels.ChannelConversion; +import org.apache.wayang.core.optimizer.channels.DefaultChannelConversion; +import org.apache.wayang.java.channels.StreamChannel; +import org.apache.wayang.jdbc.operators.SqlToRddOperator; +import org.apache.wayang.jdbc.operators.SqlToStreamOperator; +import org.apache.wayang.spark.channels.RddChannel; + +import java.util.Arrays; +import java.util.Collection; + +/** + * Register for the {@link ChannelConversion}s supported for this platform. 
+ */ +public class ChannelConversions { + + public static final ChannelConversion SQL_TO_STREAM_CONVERSION = new DefaultChannelConversion( + BigQueryPlatform.getInstance().getSqlQueryChannelDescriptor(), + StreamChannel.DESCRIPTOR, + () -> new SqlToStreamOperator(BigQueryPlatform.getInstance()) + ); + + public static final ChannelConversion SQL_TO_UNCACHED_RDD_CONVERSION = new DefaultChannelConversion( + BigQueryPlatform.getInstance().getSqlQueryChannelDescriptor(), + RddChannel.UNCACHED_DESCRIPTOR, + () -> new SqlToRddOperator(BigQueryPlatform.getInstance()) + ); + + public static final Collection ALL = Arrays.asList( + SQL_TO_STREAM_CONVERSION, + SQL_TO_UNCACHED_RDD_CONVERSION + ); + +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/FilterMapping.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/FilterMapping.java new file mode 100644 index 000000000..e109cb920 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/FilterMapping.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.wayang.bigquery.mapping; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.FilterOperator; +import org.apache.wayang.bigquery.operators.BigQueryFilterOperator; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.mapping.OperatorPattern; +import org.apache.wayang.core.mapping.PlanTransformation; +import org.apache.wayang.core.mapping.ReplacementSubplanFactory; +import org.apache.wayang.core.mapping.SubplanPattern; +import org.apache.wayang.core.types.DataSetType; + +import java.util.Collection; +import java.util.Collections; + + +/** + * Mapping from {@link FilterOperator} to {@link BigQueryFilterOperator}. + */ +@SuppressWarnings("unchecked") +public class FilterMapping implements Mapping { + + @Override + public Collection getTransformations() { + return Collections.singleton(new PlanTransformation( + this.createSubplanPattern(), + this.createReplacementSubplanFactory(), + BigQueryPlatform.getInstance() + )); + } + + private SubplanPattern createSubplanPattern() { + final OperatorPattern> operatorPattern = new OperatorPattern<>( + "filter", new FilterOperator<>(null, DataSetType.createDefault(Record.class)), false + ).withAdditionalTest(op -> op.getPredicateDescriptor().getSqlImplementation() != null); + return SubplanPattern.createSingleton(operatorPattern); + } + + private ReplacementSubplanFactory createReplacementSubplanFactory() { + return new ReplacementSubplanFactory.OfSingleOperators( + (matchedOperator, epoch) -> new BigQueryFilterOperator(matchedOperator).at(epoch) + ); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/GlobalReduceMapping.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/GlobalReduceMapping.java new file mode 100644 index 000000000..4b20ff344 --- /dev/null +++ 
b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/GlobalReduceMapping.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.mapping; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.GlobalReduceOperator; +import org.apache.wayang.bigquery.operators.BigQueryGlobalReduceOperator; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.mapping.OperatorPattern; +import org.apache.wayang.core.mapping.PlanTransformation; +import org.apache.wayang.core.mapping.ReplacementSubplanFactory; +import org.apache.wayang.core.mapping.SubplanPattern; +import org.apache.wayang.core.types.DataSetType; + +import java.util.Collection; +import java.util.Collections; + +/** + * Mapping from {@link GlobalReduceOperator} to {@link BigQueryGlobalReduceOperator}. 
+ */ +@SuppressWarnings("unchecked") +public class GlobalReduceMapping implements Mapping { + + @Override + public Collection getTransformations() { + return Collections.singleton(new PlanTransformation( + this.createSubplanPattern(), + this.createReplacementSubplanFactory(), + BigQueryPlatform.getInstance())); + } + + private SubplanPattern createSubplanPattern() { + final OperatorPattern> operatorPattern = new OperatorPattern<>( + "reduce", new GlobalReduceOperator(null, DataSetType.createDefault(Record.class)), false) + .withAdditionalTest(op -> op.getReduceDescriptor().getSqlImplementation() != null); + return SubplanPattern.createSingleton(operatorPattern); + } + + private ReplacementSubplanFactory createReplacementSubplanFactory() { + return new ReplacementSubplanFactory.OfSingleOperators>( + (matchedOperator, epoch) -> new BigQueryGlobalReduceOperator(matchedOperator).at(epoch)); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/JoinMapping.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/JoinMapping.java new file mode 100644 index 000000000..8db353600 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/JoinMapping.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.mapping; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.JoinOperator; +import org.apache.wayang.bigquery.operators.BigQueryJoinOperator; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.function.TransformationDescriptor; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.mapping.OperatorPattern; +import org.apache.wayang.core.mapping.PlanTransformation; +import org.apache.wayang.core.mapping.ReplacementSubplanFactory; +import org.apache.wayang.core.mapping.SubplanPattern; +import org.apache.wayang.core.types.DataSetType; + +import java.util.Collection; +import java.util.Collections; + +/** + * Mapping from {@link JoinOperator} to {@link BigQueryJoinOperator}. 
+ */ +@SuppressWarnings("unchecked") +public class JoinMapping implements Mapping { + + @Override + public Collection getTransformations() { + return Collections.singleton(new PlanTransformation( + this.createSubplanPattern(), + this.createReplacementSubplanFactory(), + BigQueryPlatform.getInstance() + )); + } + + private SubplanPattern createSubplanPattern() { + OperatorPattern> operatorPattern = new OperatorPattern<>( + "join", + new JoinOperator( + null, + null, + DataSetType.createDefault(Record.class), + DataSetType.createDefault(Record.class) + ), + false + ) + .withAdditionalTest(op -> op.getKeyDescriptor0() instanceof TransformationDescriptor) + .withAdditionalTest(op -> op.getKeyDescriptor1() instanceof TransformationDescriptor) + .withAdditionalTest(op -> op.getKeyDescriptor0().getSqlImplementation() != null) + .withAdditionalTest(op -> op.getKeyDescriptor1().getSqlImplementation() != null); + return SubplanPattern.createSingleton(operatorPattern); + } + + private ReplacementSubplanFactory createReplacementSubplanFactory() { + return new ReplacementSubplanFactory.OfSingleOperators>( + (matchedOperator, epoch) -> { + return new BigQueryJoinOperator(matchedOperator).at(epoch); + } + ); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/Mappings.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/Mappings.java new file mode 100644 index 000000000..715b4f1cd --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/Mappings.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.mapping; + +import org.apache.wayang.core.mapping.Mapping; + +import java.util.Arrays; +import java.util.Collection; + +/** + * Register for the {@link Mapping}s supported for this platform. + */ +public class Mappings { + + public static final Collection ALL = Arrays.asList( + new FilterMapping(), + new GlobalReduceMapping(), + new JoinMapping(), + new ProjectionMapping(), + new ReduceByMapping(), + new SortMapping(), + new TableSinkMapping() + ); + +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/ProjectionMapping.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/ProjectionMapping.java new file mode 100644 index 000000000..2c26e3a4b --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/ProjectionMapping.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.mapping; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.function.ProjectionDescriptor; +import org.apache.wayang.basic.operators.MapOperator; +import org.apache.wayang.bigquery.operators.BigQueryProjectionOperator; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.mapping.OperatorPattern; +import org.apache.wayang.core.mapping.PlanTransformation; +import org.apache.wayang.core.mapping.ReplacementSubplanFactory; +import org.apache.wayang.core.mapping.SubplanPattern; +import org.apache.wayang.core.types.DataSetType; + +import java.util.Collection; +import java.util.Collections; + +/** + * Mapping from {@link MapOperator} to {@link BigQueryProjectionOperator}. + */ +public class ProjectionMapping implements Mapping { + + @Override + public Collection getTransformations() { + return Collections.singleton(new PlanTransformation( + this.createSubplanPattern(), + this.createReplacementSubplanFactory(), + BigQueryPlatform.getInstance())); + } + + private SubplanPattern createSubplanPattern() { + OperatorPattern> operatorPattern = new OperatorPattern<>( + "projection", + new MapOperator<>( + null, + DataSetType.createDefault(Record.class), + DataSetType.createDefault(Record.class)), + false) + .withAdditionalTest(op -> op.getFunctionDescriptor() instanceof ProjectionDescriptor) + .withAdditionalTest(op -> op.getNumInputs() == 1); // No broadcasts. 
+ return SubplanPattern.createSingleton(operatorPattern); + } + + private ReplacementSubplanFactory createReplacementSubplanFactory() { + return new ReplacementSubplanFactory.OfSingleOperators>( + (matchedOperator, epoch) -> new BigQueryProjectionOperator(matchedOperator).at(epoch)); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/ReduceByMapping.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/ReduceByMapping.java new file mode 100644 index 000000000..a20c9b3cc --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/ReduceByMapping.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.wayang.bigquery.mapping; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.ReduceByOperator; +import org.apache.wayang.bigquery.operators.BigQueryReduceByOperator; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.mapping.OperatorPattern; +import org.apache.wayang.core.mapping.PlanTransformation; +import org.apache.wayang.core.mapping.ReplacementSubplanFactory; +import org.apache.wayang.core.mapping.SubplanPattern; +import org.apache.wayang.core.types.DataSetType; + +import java.util.Collection; +import java.util.Collections; + +/** + * Mapping from {@link ReduceByOperator} to {@link BigQueryReduceByOperator}. + */ +@SuppressWarnings("unchecked") +public class ReduceByMapping implements Mapping { + + @Override + public Collection getTransformations() { + return Collections.singleton(new PlanTransformation( + this.createSubplanPattern(), + this.createReplacementSubplanFactory(), + BigQueryPlatform.getInstance())); + } + + private SubplanPattern createSubplanPattern() { + final OperatorPattern> operatorPattern = new OperatorPattern<>( + "reduceBy", + new ReduceByOperator(null, null, DataSetType.createDefault(Record.class)), + false) + .withAdditionalTest(op -> op.getKeyDescriptor().getSqlImplementation() != null + && op.getReduceDescriptor().getSqlImplementation() != null); + return SubplanPattern.createSingleton(operatorPattern); + } + + private ReplacementSubplanFactory createReplacementSubplanFactory() { + return new ReplacementSubplanFactory.OfSingleOperators>( + (matchedOperator, epoch) -> new BigQueryReduceByOperator(matchedOperator).at(epoch)); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/SortMapping.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/SortMapping.java new file mode 100644 index 
000000000..e9f7a13e8 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/SortMapping.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.mapping; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.SortOperator; +import org.apache.wayang.bigquery.operators.BigQuerySortOperator; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.mapping.OperatorPattern; +import org.apache.wayang.core.mapping.PlanTransformation; +import org.apache.wayang.core.mapping.ReplacementSubplanFactory; +import org.apache.wayang.core.mapping.SubplanPattern; +import org.apache.wayang.core.types.DataSetType; + +import java.util.Collection; +import java.util.Collections; + +/** + * Mapping from {@link SortOperator} to {@link BigQuerySortOperator}. 
+ */ +@SuppressWarnings("unchecked") +public class SortMapping implements Mapping { + + @Override + public Collection getTransformations() { + return Collections.singleton(new PlanTransformation( + this.createSubplanPattern(), + this.createReplacementSubplanFactory(), + BigQueryPlatform.getInstance())); + } + + private SubplanPattern createSubplanPattern() { + final OperatorPattern> operatorPattern = new OperatorPattern<>( + "sort", + new SortOperator(null, DataSetType.createDefault(Record.class)), + false) + .withAdditionalTest(op -> op.getKeyDescriptor().getSqlImplementation() != null); + return SubplanPattern.createSingleton(operatorPattern); + } + + private ReplacementSubplanFactory createReplacementSubplanFactory() { + return new ReplacementSubplanFactory.OfSingleOperators>( + (matchedOperator, epoch) -> new BigQuerySortOperator(matchedOperator).at(epoch)); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/TableSinkMapping.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/TableSinkMapping.java new file mode 100644 index 000000000..aafaed0c8 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/mapping/TableSinkMapping.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.mapping; + +import org.apache.wayang.basic.operators.TableSink; +import org.apache.wayang.bigquery.operators.BigQueryTableSinkOperator; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.mapping.OperatorPattern; +import org.apache.wayang.core.mapping.PlanTransformation; +import org.apache.wayang.core.mapping.ReplacementSubplanFactory; +import org.apache.wayang.core.mapping.SubplanPattern; + +import java.util.Collection; +import java.util.Collections; + +/** + * Mapping from {@link TableSink} to {@link BigQueryTableSinkOperator}. + */ +@SuppressWarnings("unchecked") +public class TableSinkMapping implements Mapping { + + @Override + public Collection getTransformations() { + return Collections.singleton(new PlanTransformation( + this.createSubplanPattern(), + this.createReplacementSubplanFactory(), + BigQueryPlatform.getInstance())); + } + + private SubplanPattern createSubplanPattern() { + final OperatorPattern operatorPattern = new OperatorPattern<>( + "sink", new TableSink<>(null, null, null), false + ); + return SubplanPattern.createSingleton(operatorPattern); + } + + private ReplacementSubplanFactory createReplacementSubplanFactory() { + return new ReplacementSubplanFactory.OfSingleOperators( + (matchedOperator, epoch) -> new BigQueryTableSinkOperator(matchedOperator).at(epoch)); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryExecutionOperator.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryExecutionOperator.java new file mode 100644 index 000000000..a496042fb --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryExecutionOperator.java @@ -0,0 +1,31 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.operators; + +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.jdbc.operators.JdbcExecutionOperator; + +public interface BigQueryExecutionOperator extends JdbcExecutionOperator { + + @Override + default BigQueryPlatform getPlatform() { + return BigQueryPlatform.getInstance(); + } + +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryFilterOperator.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryFilterOperator.java new file mode 100644 index 000000000..ee246c93d --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryFilterOperator.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.operators; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.FilterOperator; +import org.apache.wayang.core.function.PredicateDescriptor; +import org.apache.wayang.jdbc.operators.JdbcFilterOperator; + + +/** + * BigQuery implementation of the {@link FilterOperator}. + */ +public class BigQueryFilterOperator extends JdbcFilterOperator implements BigQueryExecutionOperator { + + /** + * Creates a new instance. + */ + public BigQueryFilterOperator(PredicateDescriptor predicateDescriptor) { + super(predicateDescriptor); + } + + /** + * Copies an instance (exclusive of broadcasts). 
+ * + * @param that that should be copied + */ + public BigQueryFilterOperator(FilterOperator that) { + super(that); + } + + @Override + protected BigQueryFilterOperator createCopy() { + return new BigQueryFilterOperator(this); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryGlobalReduceOperator.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryGlobalReduceOperator.java new file mode 100644 index 000000000..b6b115e10 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryGlobalReduceOperator.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.operators; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.GlobalReduceOperator; +import org.apache.wayang.core.function.ReduceDescriptor; +import org.apache.wayang.jdbc.operators.JdbcGlobalReduceOperator; + +/** + * BigQuery implementation of the {@link GlobalReduceOperator}. The reduction is + * pushed down as a SQL aggregate (e.g. {@code SUM(amount)}) via its + * {@code sqlImplementation}. 
+ */ +public class BigQueryGlobalReduceOperator extends JdbcGlobalReduceOperator implements BigQueryExecutionOperator { + + public BigQueryGlobalReduceOperator(ReduceDescriptor reduceDescriptor) { + super(reduceDescriptor); + } + + /** + * Copies an instance (exclusive of broadcasts). + * + * @param that that should be copied + */ + public BigQueryGlobalReduceOperator(GlobalReduceOperator that) { + super(that); + } + + @Override + protected BigQueryGlobalReduceOperator createCopy() { + return new BigQueryGlobalReduceOperator(this); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryJoinOperator.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryJoinOperator.java new file mode 100644 index 000000000..40d444c43 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryJoinOperator.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.wayang.bigquery.operators; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.JoinOperator; +import org.apache.wayang.core.function.TransformationDescriptor; +import org.apache.wayang.jdbc.operators.JdbcJoinOperator; + + +/** + * BigQuery implementation of the {@link JoinOperator}. + */ +public class BigQueryJoinOperator extends JdbcJoinOperator implements BigQueryExecutionOperator { + + /** + * Creates a new instance. + */ + public BigQueryJoinOperator( + TransformationDescriptor keyDescriptor0, + TransformationDescriptor keyDescriptor1) { + super(keyDescriptor0,keyDescriptor1); + } + + public BigQueryJoinOperator(JoinOperator that) { + super(that); + } + + @Override + protected BigQueryJoinOperator createCopy() { + return new BigQueryJoinOperator(this); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryProjectionOperator.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryProjectionOperator.java new file mode 100644 index 000000000..6cd0b538e --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryProjectionOperator.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.operators; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.function.ProjectionDescriptor; +import org.apache.wayang.basic.operators.FilterOperator; +import org.apache.wayang.basic.operators.MapOperator; +import org.apache.wayang.jdbc.operators.JdbcProjectionOperator; + +/** + * BigQuery implementation of a projection, i.e. a {@link MapOperator} that is driven by a {@link ProjectionDescriptor}. + */ +public class BigQueryProjectionOperator extends JdbcProjectionOperator implements BigQueryExecutionOperator { + + public BigQueryProjectionOperator(String... fieldNames) { + super(fieldNames); + } + + public BigQueryProjectionOperator(ProjectionDescriptor functionDescriptor) { + super(functionDescriptor); + } + + public BigQueryProjectionOperator(MapOperator that) { + super(that); + } + + @Override + protected BigQueryProjectionOperator createCopy() { + return new BigQueryProjectionOperator(this); + } + +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryReduceByOperator.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryReduceByOperator.java new file mode 100644 index 000000000..cacf9dcaa --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryReduceByOperator.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.operators; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.ReduceByOperator; +import org.apache.wayang.core.function.ReduceDescriptor; +import org.apache.wayang.core.function.TransformationDescriptor; +import org.apache.wayang.jdbc.operators.JdbcReduceByOperator; + +/** + * BigQuery implementation of the {@link ReduceByOperator}. The grouping key and + * the reduction are pushed down as a SQL {@code GROUP BY} plus aggregate (e.g. + * {@code SELECT region, SUM(amount) ... GROUP BY region}). + */ +public class BigQueryReduceByOperator extends JdbcReduceByOperator implements BigQueryExecutionOperator { + + public BigQueryReduceByOperator(TransformationDescriptor keyDescriptor, + ReduceDescriptor reduceDescriptor) { + super(keyDescriptor, reduceDescriptor); + } + + /** + * Copies an instance (exclusive of broadcasts). 
+ * + * @param that that should be copied + */ + public BigQueryReduceByOperator(ReduceByOperator that) { + super(that); + } + + @Override + protected BigQueryReduceByOperator createCopy() { + return new BigQueryReduceByOperator(this); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQuerySortOperator.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQuerySortOperator.java new file mode 100644 index 000000000..2aea82627 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQuerySortOperator.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.operators; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.SortOperator; +import org.apache.wayang.core.function.TransformationDescriptor; +import org.apache.wayang.jdbc.operators.JdbcSortOperator; + +/** + * BigQuery implementation of the {@link SortOperator}. The sort key and direction + * are pushed down as a SQL {@code ORDER BY} clause via its {@code sqlImplementation}. 
+ */ +public class BigQuerySortOperator extends JdbcSortOperator implements BigQueryExecutionOperator { + + public BigQuerySortOperator(TransformationDescriptor keyDescriptor) { + super(keyDescriptor); + } + + /** + * Copies an instance (exclusive of broadcasts). + * + * @param that that should be copied + */ + public BigQuerySortOperator(SortOperator that) { + super(that); + } + + @Override + protected BigQuerySortOperator createCopy() { + return new BigQuerySortOperator(this); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryTableSinkOperator.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryTableSinkOperator.java new file mode 100644 index 000000000..c7c065013 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryTableSinkOperator.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.wayang.bigquery.operators; + +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.operators.TableSink; +import org.apache.wayang.jdbc.operators.JdbcTableSinkOperator; + +/** + * BigQuery implementation of the {@link JdbcTableSinkOperator}. The sink stays + * entirely within BigQuery: the composed query is wrapped in a + * {@code CREATE TABLE ... AS} (mode {@code overwrite}) or {@code INSERT INTO ...} + * statement. + * + *

Table names follow BigQuery's backtick-quoted convention + * {@code `project.dataset.table`}. + */ +public class BigQueryTableSinkOperator extends JdbcTableSinkOperator implements BigQueryExecutionOperator { + + public BigQueryTableSinkOperator(String tableName, String[] columnNames) { + super(tableName, columnNames); + } + + /** + * Copies an instance (exclusive of broadcasts). + * + * @param that that should be copied + */ + public BigQueryTableSinkOperator(TableSink that) { + super(that); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryTableSource.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryTableSource.java new file mode 100644 index 000000000..2d71d3746 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/operators/BigQueryTableSource.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.wayang.bigquery.operators; + +import org.apache.wayang.basic.operators.TableSource; +import org.apache.wayang.core.platform.ChannelDescriptor; +import org.apache.wayang.jdbc.operators.JdbcTableSource; + +import java.util.List; + +/** + * BigQuery implementation for the {@link TableSource}. + * + *

Table names must be backtick-quoted and fully qualified: + * {@code `project.dataset.table`}. Pass the backtick-quoted name as the + * {@code tableName} constructor argument. + */ +public class BigQueryTableSource extends JdbcTableSource implements BigQueryExecutionOperator { + + /** + * Creates a new instance. + * + * @see TableSource#TableSource(String, String...) + */ + public BigQueryTableSource(String tableName, String... columnNames) { + super(tableName, columnNames); + } + + /** + * Copies an instance (exclusive of broadcasts). + * + * @param that that should be copied + */ + public BigQueryTableSource(JdbcTableSource that) { + super(that); + } + + @Override + public List getSupportedInputChannels(int index) { + throw new UnsupportedOperationException("This operator has no input channels."); + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/platform/BigQueryPlatform.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/platform/BigQueryPlatform.java new file mode 100644 index 000000000..8ab7c036d --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/platform/BigQueryPlatform.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.platform; + +import org.apache.wayang.core.platform.Platform; +import org.apache.wayang.jdbc.platform.JdbcPlatformTemplate; + +/** + * {@link Platform} implementation for BigQuery. + * + *

BigQuery JDBC URL format: + *

+ *   jdbc:bigquery://https://www.googleapis.com/bigquery/v2;
+ *     ProjectId=my-project;
+ *     OAuthType=0;
+ *     OAuthServiceAcctEmail=sa@my-project.iam.gserviceaccount.com;
+ *     OAuthPvtKeyPath=/path/to/key.json
+ * 
+ * + *

Table names must be backtick-quoted: {@code `project.dataset.table`}. + */ +public class BigQueryPlatform extends JdbcPlatformTemplate { + + private static final String PLATFORM_NAME = "BigQuery"; + + private static final String CONFIG_NAME = "bigquery"; + + private static BigQueryPlatform instance = null; + + public static BigQueryPlatform getInstance() { + if (instance == null) { + instance = new BigQueryPlatform(); + } + return instance; + } + + protected BigQueryPlatform() { + super(PLATFORM_NAME, CONFIG_NAME); + } + + @Override + public String getJdbcDriverClassName() { + return "com.google.cloud.bigquery.jdbc.BigQueryDriver"; + } + +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/plugin/BigQueryConversionsPlugin.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/plugin/BigQueryConversionsPlugin.java new file mode 100644 index 000000000..d828489ad --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/plugin/BigQueryConversionsPlugin.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.wayang.bigquery.plugin; + +import org.apache.wayang.bigquery.channels.ChannelConversions; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.api.Configuration; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.optimizer.channels.ChannelConversion; +import org.apache.wayang.core.plan.wayangplan.Operator; +import org.apache.wayang.core.platform.Platform; +import org.apache.wayang.core.plugin.Plugin; +import org.apache.wayang.java.platform.JavaPlatform; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; + +/** + * This {@link Plugin} contributes only the {@link ChannelConversion}s for the {@link BigQueryPlatform}; it registers no {@link Operator} {@link Mapping}s. + */ +public class BigQueryConversionsPlugin implements Plugin { + + @Override + public Collection getRequiredPlatforms() { + return Arrays.asList(BigQueryPlatform.getInstance(), JavaPlatform.getInstance()); + } + + @Override + public Collection getMappings() { + return Collections.emptyList(); + } + + @Override + public Collection getChannelConversions() { + return ChannelConversions.ALL; + } + + @Override + public void setProperties(Configuration configuration) { + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/plugin/BigQueryPlugin.java b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/plugin/BigQueryPlugin.java new file mode 100644 index 000000000..cf4dc3863 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/java/org/apache/wayang/bigquery/plugin/BigQueryPlugin.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.wayang.bigquery.plugin; + +import org.apache.wayang.bigquery.channels.ChannelConversions; +import org.apache.wayang.bigquery.mapping.Mappings; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.apache.wayang.core.api.Configuration; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.optimizer.channels.ChannelConversion; +import org.apache.wayang.core.plan.wayangplan.Operator; +import org.apache.wayang.core.platform.Platform; +import org.apache.wayang.core.plugin.Plugin; +import org.apache.wayang.java.platform.JavaPlatform; + +import java.util.Arrays; +import java.util.Collection; + +/** + * This {@link Plugin} enables to use some basic Wayang {@link Operator}s on the {@link BigQueryPlatform}. 
+ */ +public class BigQueryPlugin implements Plugin { + + @Override + public Collection getRequiredPlatforms() { + return Arrays.asList(BigQueryPlatform.getInstance(), JavaPlatform.getInstance()); + } + + @Override + public Collection getMappings() { + return Mappings.ALL; + } + + @Override + public Collection getChannelConversions() { + return ChannelConversions.ALL; + } + + @Override + public void setProperties(Configuration configuration) { + } +} diff --git a/wayang-platforms/wayang-bigquery/src/main/resources/wayang-bigquery-defaults.properties b/wayang-platforms/wayang-bigquery/src/main/resources/wayang-bigquery-defaults.properties new file mode 100644 index 000000000..ce1a986d8 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/main/resources/wayang-bigquery-defaults.properties @@ -0,0 +1,188 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# JDBC driver (loaded via reflection — no compile-time dependency needed) +wayang.bigquery.jdbc.driverName = com.google.cloud.bigquery.jdbc.BigQueryDriver + +# Connection URL and credentials are deployment-specific. +# Set these in your wayang.properties or programmatically via Configuration. 
+# +# Example: +# wayang.bigquery.jdbc.url = jdbc:bigquery://https://www.googleapis.com/bigquery/v2;\ +# ProjectId=my-project;\ +# OAuthType=0;\ +# OAuthServiceAcctEmail=sa@my-project.iam.gserviceaccount.com;\ +# OAuthPvtKeyPath=/path/to/key.json +# +# wayang.bigquery.jdbc.url = (required — set per deployment) +# wayang.bigquery.jdbc.user = (optional) +# wayang.bigquery.jdbc.password = (optional) + +# ── Hardware profile ────────────────────────────────────────────────────────── +# BigQuery is serverless and runs on Google's shared compute. +# Model enough cores for full parallelism; latency is dominated by network +# and query dispatch rather than raw CPU. +wayang.bigquery.cpu.mhz = 2700 +wayang.bigquery.cores = 8 +wayang.bigquery.costs.fix = 0.0 +wayang.bigquery.costs.per-ms = 1.0 + +# ── Cost model ──────────────────────────────────────────────────────────────── +# +# Formula: cpu = α * rows + β +# +# BigQuery is a serverless, massively parallel columnar engine. +# Per-row cost (α) is very low because scans run across thousands of slots +# in parallel. Fixed startup (β) is high due to query dispatch, planning, +# billing overhead, and result serialisation back over the network. +# +# Compared to single-node sources: +# α = 5 — massively parallel; per-row overhead ~10× lower than Postgres +# β = 2000000 — serverless dispatch + billing + network round-trip +# +# Optimizer crossover points: +# BigQuery vs Postgres: 5n+2M < 55n+380k → n > ~32k rows +# BigQuery vs Trino: 5n+2M < 10n+800k → n > ~240k rows +# +# These are initial estimates. Tune by running the .load.template variants +# and fitting measured data. 
+# ────────────────────────────────────────────────────────────────────────────── + +wayang.bigquery.tablesource.load.template = {\ + "type":"mathex", "in":0, "out":1,\ + "cpu":"?*out0 + ?",\ + "ram":"0",\ + "p":0.9\ +} +wayang.bigquery.tablesource.load = {\ + "in":0, "out":1,\ + "cpu":"${5*out0 + 2000000}",\ + "ram":"0",\ + "p":0.9\ +} + +wayang.bigquery.filter.load.template = {\ + "type":"mathex", "in":1, "out":1,\ + "cpu":"?*in0 + ?",\ + "ram":"0",\ + "p":0.9\ +} +wayang.bigquery.filter.load = {\ + "in":1, "out":1,\ + "cpu":"${5*in0 + 2000000}",\ + "ram":"0",\ + "p":0.9\ +} + +wayang.bigquery.projection.load.template = {\ + "type":"mathex", "in":1, "out":1,\ + "cpu":"?*in0 + ?",\ + "ram":"0",\ + "p":0.9\ +} +wayang.bigquery.projection.load = {\ + "in":1, "out":1,\ + "cpu":"${5*in0 + 2000000}",\ + "ram":"0",\ + "p":0.9\ +} + +wayang.bigquery.join.load.template = {\ + "type":"mathex", "in":2, "out":1,\ + "cpu":"?*in0 + ?*in1 + ?",\ + "ram":"0",\ + "p":0.9\ +} +wayang.bigquery.join.load = {\ + "in":2, "out":1,\ + "cpu":"${5*in0 + 5*in1 + 2000000}",\ + "ram":"0",\ + "p":0.9\ +} + +wayang.bigquery.globalreduce.load.template = {\ + "type":"mathex", "in":1, "out":1,\ + "cpu":"?*in0 + ?",\ + "ram":"0",\ + "p":0.9\ +} +wayang.bigquery.globalreduce.load = {\ + "in":1, "out":1,\ + "cpu":"${5*in0 + 2000000}",\ + "ram":"0",\ + "p":0.9\ +} + +wayang.bigquery.reduceby.load.template = {\ + "type":"mathex", "in":1, "out":1,\ + "cpu":"?*in0 + ?",\ + "ram":"0",\ + "p":0.9\ +} +wayang.bigquery.reduceby.load = {\ + "in":1, "out":1,\ + "cpu":"${5*in0 + 2000000}",\ + "ram":"0",\ + "p":0.9\ +} + +wayang.bigquery.sort.load.template = {\ + "type":"mathex", "in":1, "out":1,\ + "cpu":"?*in0 + ?",\ + "ram":"0",\ + "p":0.9\ +} +wayang.bigquery.sort.load = {\ + "in":1, "out":1,\ + "cpu":"${5*in0 + 2000000}",\ + "ram":"0",\ + "p":0.9\ +} + +wayang.bigquery.tablesink.load.template = {\ + "type":"mathex", "in":1, "out":0,\ + "cpu":"?*in0 + ?",\ + "ram":"0",\ + "p":0.9\ +} 
+wayang.bigquery.tablesink.load = {\ + "in":1, "out":0,\ + "cpu":"${5*in0 + 2000000}",\ + "ram":"0",\ + "p":0.9\ +} + +wayang.bigquery.sqltostream.load.query.template = {\ + "type":"mathex", "in":1, "out":1,\ + "cpu":"?*out0 + ?"\ +} +wayang.bigquery.sqltostream.load.query = {\ + "in":1, "out":1,\ + "cpu":"${5*out0 + 2000000}",\ + "ram":"0",\ + "p":0.9\ +} +wayang.bigquery.sqltostream.load.output.template = {\ + "type":"mathex", "in":1, "out":1,\ + "cpu":"?*out0"\ +} +wayang.bigquery.sqltostream.load.output = {\ + "in":1, "out":1,\ + "cpu":"${5*out0}",\ + "ram":"0",\ + "p":0.9\ +} diff --git a/wayang-platforms/wayang-bigquery/src/test/java/org/apache/wayang/bigquery/BigQueryOperatorsIT.java b/wayang-platforms/wayang-bigquery/src/test/java/org/apache/wayang/bigquery/BigQueryOperatorsIT.java new file mode 100644 index 000000000..bc96addf5 --- /dev/null +++ b/wayang-platforms/wayang-bigquery/src/test/java/org/apache/wayang/bigquery/BigQueryOperatorsIT.java @@ -0,0 +1,599 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.wayang.bigquery; + +import org.apache.wayang.api.DataQuantaBuilder; +import org.apache.wayang.api.JavaPlanBuilder; +import org.apache.wayang.basic.data.Record; +import org.apache.wayang.basic.data.Tuple2; +import org.apache.wayang.basic.function.ProjectionDescriptor; +import org.apache.wayang.basic.operators.FilterOperator; +import org.apache.wayang.basic.operators.GlobalReduceOperator; +import org.apache.wayang.basic.operators.JoinOperator; +import org.apache.wayang.basic.operators.MapOperator; +import org.apache.wayang.basic.operators.ReduceByOperator; +import org.apache.wayang.basic.operators.SortOperator; +import org.apache.wayang.basic.operators.TableSink; +import org.apache.wayang.basic.types.RecordType; +import org.apache.wayang.core.api.Configuration; +import org.apache.wayang.core.api.WayangContext; +import org.apache.wayang.core.function.FunctionDescriptor; +import org.apache.wayang.core.function.PredicateDescriptor; +import org.apache.wayang.core.function.ReduceDescriptor; +import org.apache.wayang.core.function.TransformationDescriptor; +import org.apache.wayang.core.mapping.Mapping; +import org.apache.wayang.core.mapping.OperatorPattern; +import org.apache.wayang.core.mapping.PlanTransformation; +import org.apache.wayang.core.mapping.ReplacementSubplanFactory; +import org.apache.wayang.core.mapping.SubplanPattern; +import org.apache.wayang.core.plan.wayangplan.WayangPlan; +import org.apache.wayang.core.types.DataSetType; +import org.apache.wayang.core.types.DataUnitType; +import org.apache.wayang.bigquery.operators.BigQueryProjectionOperator; +import org.apache.wayang.bigquery.operators.BigQueryTableSource; +import org.apache.wayang.bigquery.platform.BigQueryPlatform; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.MethodOrderer; +import org.junit.jupiter.api.Order; +import 
org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestMethodOrder; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.Statement; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Engine-only end-to-end integration tests for every operator the BigQuery + * platform implements, driven through the Wayang API against real BigQuery. + * + *

Coverage: {@code TableSource}, {@code Filter}, {@code Projection}, + * {@code Join}, {@code GlobalReduce}, {@code ReduceBy}, {@code Sort}, + * {@code TableSink}. Every Wayang plan ends in a BigQuery {@code TableSink} that + * compiles to {@code CREATE TABLE `proj.ds.t` AS SELECT ...} executed inside + * BigQuery. Only {@code BigQuery.plugin()} is registered; there is no + * {@code Java.basicPlugin()}, so the optimizer has no Java operators to fall back + * to and the whole plan necessarily runs in BigQuery. Assertions re-query the + * sink table via plain JDBC only after {@code execute(...)} returns; the sink + * table's existence + contents prove the CTAS ran in BigQuery. + * + *

The source tables are created from inline literals in {@link #setUp()}, so + * no external BigQuery dataset or table is required. Requires a live GCP project + * + service account (the JDBC driver mandates OAuth2; the local emulator cannot + * serve it). + */ +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +class BigQueryOperatorsIT { + + private static final String PROJECT_ID = cfg("bigquery.project", "BIGQUERY_PROJECT", "your-project"); + private static final String SA_EMAIL = cfg("bigquery.saEmail", "BIGQUERY_SA_EMAIL", + "wayang-bq@" + PROJECT_ID + ".iam.gserviceaccount.com"); + private static final String KEY_PATH = cfg("bigquery.keyPath", "BIGQUERY_KEY_PATH", + System.getProperty("user.home") + "/wayang-bq-key.json"); + private static final String LOCATION = cfg("bigquery.location", "BIGQUERY_LOCATION", "US"); + private static final String DATASET = cfg("bigquery.dataset", "BIGQUERY_DATASET", "wayang_it"); + + private static final String TABLE = tableName("orders"); + private static final String SINK_TABLE = tableName("operator_result"); + private static final String JOIN_TABLE = tableName("regions"); + private static final String[] JOIN_COLUMNS = {"order_id", "region", "product", "amount", "region_name"}; + private static final String JOIN_FLATTEN_NAME = "BigQuery test-only join flatten"; + + private static final String JDBC_URL = String.format( + "jdbc:bigquery://https://www.googleapis.com/bigquery/v2;" + + "ProjectId=%s;OAuthType=0;OAuthServiceAcctEmail=%s;OAuthPvtKeyPath=%s;Location=%s", + PROJECT_ID, SA_EMAIL, KEY_PATH, LOCATION); + + private static boolean available = false; + + private static String cfg(String sysProp, String envVar, String dflt) { + String v = System.getProperty(sysProp); + if (v == null || v.isEmpty()) v = System.getenv(envVar); + return (v == null || v.isEmpty()) ? dflt : v; + } + + private static String tableName(String table) { + return "`" + PROJECT_ID + "." + DATASET + "." 
+ table + "`"; + } + + private static void createFixtureTables(Connection conn) throws Exception { + try (Statement st = conn.createStatement()) { + st.execute("CREATE SCHEMA IF NOT EXISTS `" + PROJECT_ID + "." + DATASET + "` " + + "OPTIONS(location='" + LOCATION + "')"); + st.execute("DROP TABLE IF EXISTS " + SINK_TABLE); + st.execute("DROP TABLE IF EXISTS " + JOIN_TABLE); + st.execute("DROP TABLE IF EXISTS " + TABLE); + st.execute("CREATE TABLE " + TABLE + " AS " + + "SELECT * FROM UNNEST([" + + "STRUCT(1 AS order_id, 'APAC' AS region, 'Widget A' AS product, 1500.0 AS amount)," + + "STRUCT(2 AS order_id, 'EMEA' AS region, 'Widget B' AS product, 800.5 AS amount)," + + "STRUCT(3 AS order_id, 'AMER' AS region, 'Widget A' AS product, 2200.0 AS amount)," + + "STRUCT(4 AS order_id, 'APAC' AS region, 'Widget C' AS product, 350.75 AS amount)," + + "STRUCT(5 AS order_id, 'EMEA' AS region, 'Widget A' AS product, 1100.0 AS amount)," + + "STRUCT(6 AS order_id, 'AMER' AS region, 'Widget B' AS product, 950.25 AS amount)," + + "STRUCT(7 AS order_id, 'APAC' AS region, 'Widget B' AS product, 1750.0 AS amount)," + + "STRUCT(8 AS order_id, 'EMEA' AS region, 'Widget C' AS product, 420.0 AS amount)," + + "STRUCT(9 AS order_id, 'AMER' AS region, 'Widget C' AS product, 680.5 AS amount)," + + "STRUCT(10 AS order_id, 'APAC' AS region, 'Widget A' AS product, 3000.0 AS amount)" + + "])"); + // Lookup table for the join tests; region_name avoids duplicate + // region columns in the flattened CREATE TABLE AS SELECT. 
+            st.execute("CREATE TABLE " + JOIN_TABLE + " AS SELECT DISTINCT region AS region_name FROM " + TABLE);
+        }
+    }
+
+    // Lifecycle
+
+    /**
+     * Loads the BigQuery JDBC driver, probes the connection with {@code SELECT 1}
+     * and (re)creates the fixture tables. {@code available} is set only after the
+     * fixtures exist, so a failure anywhere in here really does skip every test
+     * instead of letting them run against missing tables.
+     */
+    @BeforeAll
+    static void setUp() {
+        try {
+            Class.forName("com.google.cloud.bigquery.jdbc.BigQueryDriver");
+            try (Connection conn = DriverManager.getConnection(JDBC_URL)) {
+                final boolean reachable;
+                // Close the probe statement and its result set before issuing DDL.
+                try (Statement probe = conn.createStatement();
+                     ResultSet rs = probe.executeQuery("SELECT 1")) {
+                    reachable = rs.next();
+                }
+                createFixtureTables(conn);
+                // Only now is it safe to let the tests run.
+                available = reachable;
+                System.out.println("[SETUP] Connected to BigQuery project: " + PROJECT_ID);
+            }
+        } catch (Exception e) {
+            // Deliberately best-effort: without BigQuery the suite is skipped, not failed.
+            System.err.println("[SETUP] BigQuery not available; all tests will be skipped: " + e.getMessage());
+        }
+    }
+
+    /**
+     * Drops the sink, join and source tables. Does nothing when setUp() did not
+     * complete; leftovers from a partial setup are dropped by the next run's
+     * createFixtureTables().
+     */
+    @AfterAll
+    static void cleanup() {
+        if (!available) return;
+        try (Connection conn = DriverManager.getConnection(JDBC_URL);
+             Statement st = conn.createStatement()) {
+            st.execute("DROP TABLE IF EXISTS " + SINK_TABLE);
+            st.execute("DROP TABLE IF EXISTS " + JOIN_TABLE);
+            st.execute("DROP TABLE IF EXISTS " + TABLE);
+        } catch (Exception e) {
+            System.err.println("[CLEANUP] failed: " + e.getMessage());
+        }
+    }
+
+    // Tests (one per operator)
+
+    /** Copies the orders table unchanged into the sink and expects all ten rows. */
+    @Test
+    @Order(1)
+    @DisplayName("BigQuery engine-only: TableSource -> TableSink")
+    void tableSource() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+        BigQueryTableSource src = new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount");
+        TableSink sink = tableSink("order_id", "region", "product", "amount");
+        src.connectTo(0, sink, 0);
+
+        wayangContext().execute("BQ-TableSource", new WayangPlan(sink));
+
+        assertEquals(10, queryLong("SELECT count(*) FROM " + SINK_TABLE), "all 10 orders expected");
+    }
+
+    /** Keeps only the EMEA orders and expects exactly the three EMEA rows in the sink. */
+    @Test
+    @Order(2)
+    @DisplayName("BigQuery engine-only: Filter -> TableSink")
+    void filter() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+        BigQueryTableSource src = new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount");
+        FilterOperator filter = new FilterOperator<>(
+                new
PredicateDescriptor<>(
+                        (Record r) -> "EMEA".equals(r.getField(1)), Record.class
+                ).withSqlImplementation("region = 'EMEA'"));
+        TableSink sink = tableSink("order_id", "region", "product", "amount");
+        src.connectTo(0, filter, 0);
+        filter.connectTo(0, sink, 0);
+
+        wayangContext().execute("BQ-Filter", new WayangPlan(sink));
+
+        assertEquals(3, queryLong("SELECT count(*) FROM " + SINK_TABLE), "3 EMEA orders expected");
+        assertEquals(0, queryLong("SELECT COUNTIF(region != 'EMEA') FROM " + SINK_TABLE), "only EMEA rows");
+    }
+
+    /**
+     * Filters to EMEA, projects to (region, amount) and checks both the row count
+     * and that the sink table has exactly two columns.
+     */
+    @Test
+    @Order(3)
+    @DisplayName("BigQuery engine-only: Projection -> TableSink")
+    void projection() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+        BigQueryTableSource src = new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount");
+        FilterOperator filter = new FilterOperator<>(
+                new PredicateDescriptor<>(
+                        (Record r) -> "EMEA".equals(r.getField(1)), Record.class
+                ).withSqlImplementation("region = 'EMEA'"));
+        MapOperator projection = new MapOperator<>(
+                ProjectionDescriptor.createForRecords(
+                        new RecordType("order_id", "region", "product", "amount"),
+                        "region", "amount"),
+                DataSetType.createDefault(Record.class),
+                DataSetType.createDefault(Record.class));
+        TableSink sink = tableSink("region", "amount");
+        src.connectTo(0, filter, 0);
+        filter.connectTo(0, projection, 0);
+        projection.connectTo(0, sink, 0);
+
+        wayangContext().execute("BQ-Projection", new WayangPlan(sink));
+
+        assertEquals(3, queryLong("SELECT count(*) FROM " + SINK_TABLE), "3 EMEA rows expected");
+        assertEquals(2, columnCount(SINK_TABLE), "projection keeps only 2 columns");
+    }
+
+    /**
+     * Joins orders with the regions lookup table on region = region_name, flattens
+     * the joined pair into one record and expects one output row per order.
+     */
+    @Test
+    @Order(4)
+    @DisplayName("BigQuery engine-only: Join -> TableSink")
+    void join() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+        BigQueryTableSource orders = new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount");
+        BigQueryTableSource regions = new BigQueryTableSource(JOIN_TABLE, "region_name");
+
JoinOperator join = new JoinOperator<>(
+                // Left key: orders.region (field 1); right key: regions.region_name (field 0).
+                new TransformationDescriptor<>(
+                        (Record r) -> new Record(r.getField(1)), Record.class, Record.class
+                ).withSqlImplementation(TABLE, "region"),
+                new TransformationDescriptor<>(
+                        (Record r) -> new Record(r.getField(0)), Record.class, Record.class
+                ).withSqlImplementation(JOIN_TABLE, "region_name"));
+        // The flatten step consumes the joined pair and emits one flat Record; its
+        // type arguments are written out because the declaration is otherwise invalid.
+        MapOperator<Tuple2<Record, Record>, Record> flatten = joinFlattenOperator();
+        TableSink sink = tableSink(JOIN_COLUMNS);
+        orders.connectTo(0, join, 0);
+        regions.connectTo(0, join, 1);
+        join.connectTo(0, flatten, 0);
+        flatten.connectTo(0, sink, 0);
+
+        wayangContext().execute("BQ-Join", new WayangPlan(sink));
+
+        assertEquals(10, queryLong("SELECT count(*) FROM " + SINK_TABLE),
+                "join yields one row per order (every region exists)");
+        assertEquals(0, queryLong("SELECT COUNTIF(region != region_name) FROM " + SINK_TABLE),
+                "joined regions should match");
+    }
+
+    /** Sums the amount column over all orders and expects a single row holding 12752.0. */
+    @Test
+    @Order(5)
+    @DisplayName("BigQuery engine-only: GlobalReduce -> TableSink")
+    void globalReduce() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+        BigQueryTableSource src = new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount");
+        GlobalReduceOperator reduce = new GlobalReduceOperator<>(
+                new ReduceDescriptor<>((a, b) -> a, Record.class)
+                        .withSqlImplementation("SUM(amount) AS total_amount"),
+                DataSetType.createDefault(Record.class));
+        TableSink sink = tableSink("total_amount");
+        src.connectTo(0, reduce, 0);
+        reduce.connectTo(0, sink, 0);
+
+        wayangContext().execute("BQ-GlobalReduce", new WayangPlan(sink));
+
+        assertSingleDoubleResult(12752.0, "global reduce collapses to a single SUM row");
+    }
+
+    /** Groups orders by region, sums the amounts and checks the three per-region totals. */
+    @Test
+    @Order(6)
+    @DisplayName("BigQuery engine-only: ReduceBy -> TableSink")
+    void reduceBy() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+        BigQueryTableSource src = new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount");
+        ReduceByOperator reduceBy = new ReduceByOperator<>(
+                new
TransformationDescriptor<>(
+                        (Record r) -> new Record(r.getField(1)), Record.class, Record.class
+                ).withSqlImplementation("region", "region"),
+                new ReduceDescriptor<>((a, b) -> a, Record.class)
+                        .withSqlImplementation("SUM(amount) AS total_amount"),
+                DataSetType.createDefault(Record.class));
+        TableSink sink = tableSink("region", "total_amount");
+        src.connectTo(0, reduceBy, 0);
+        reduceBy.connectTo(0, sink, 0);
+
+        wayangContext().execute("BQ-ReduceBy", new WayangPlan(sink));
+
+        // Typed map: the assertEquals(double, double, double) calls below need
+        // Double values, which a raw Map cannot provide.
+        Map<String, Double> sums = readRegionSums();
+        assertEquals(3, sums.size(), "one row per region expected");
+        assertEquals(3830.75, sums.get("AMER"), 0.01);
+        assertEquals(2320.5, sums.get("EMEA"), 0.01);
+        assertEquals(6600.75, sums.get("APAC"), 0.01);
+    }
+
+    /**
+     * Sorts orders by amount ascending and checks that the sink keeps all ten rows
+     * and the same minimum and maximum amount. Row order in the sink table is not
+     * asserted.
+     */
+    @Test
+    @Order(7)
+    @DisplayName("BigQuery engine-only: Sort -> TableSink")
+    void sort() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+        BigQueryTableSource src = new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount");
+        SortOperator sort = new SortOperator<>(
+                new TransformationDescriptor<>(
+                        (Record r) -> new Record(r.getField(3)), Record.class, Record.class
+                ).withSqlImplementation("amount", "ASC"),
+                DataSetType.createDefault(Record.class));
+        TableSink sink = tableSink("order_id", "region", "product", "amount");
+        src.connectTo(0, sort, 0);
+        sort.connectTo(0, sink, 0);
+
+        wayangContext().execute("BQ-Sort", new WayangPlan(sink));
+
+        assertEquals(10, queryLong("SELECT count(*) FROM " + SINK_TABLE), "sort preserves cardinality");
+        assertEquals(350.75, queryDouble("SELECT min(amount) FROM " + SINK_TABLE), 0.001);
+        assertEquals(3000.0, queryDouble("SELECT max(amount) FROM " + SINK_TABLE), 0.001);
+    }
+
+    /** Writes the EMEA orders through an explicitly constructed overwrite-mode TableSink. */
+    @Test
+    @Order(8)
+    @DisplayName("BigQuery engine-only: TableSink (filter -> CREATE TABLE AS SELECT)")
+    void tableSink() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+        BigQueryTableSource src = new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount");
+
FilterOperator filter = new FilterOperator<>(
+                new PredicateDescriptor<>(
+                        (Record r) -> "EMEA".equals(r.getField(1)), Record.class
+                ).withSqlImplementation("region = 'EMEA'"));
+        // Built directly rather than through the tableSink(String...) helper.
+        TableSink sink = new TableSink<>(
+                new Properties(), "overwrite", SINK_TABLE,
+                "order_id", "region", "product", "amount");
+        src.connectTo(0, filter, 0);
+        filter.connectTo(0, sink, 0);
+
+        wayangContext().execute("BQ-TableSink", new WayangPlan(sink));
+
+        assertEquals(3, queryLong("SELECT count(*) FROM " + SINK_TABLE), "sink holds all 3 EMEA orders");
+        assertEquals(0, queryLong("SELECT COUNTIF(region != 'EMEA') FROM " + SINK_TABLE), "only EMEA rows");
+    }
+
+    // JavaPlanBuilder combination tests
+
+    /**
+     * Fluent-API pipeline: EMEA filter, projection to (order_id, amount), overwrite
+     * the sink; checks the row count and the column count.
+     */
+    @Test
+    @Order(9)
+    @DisplayName("BigQuery engine-only JavaPlanBuilder: readTable -> filter -> projection -> tableSink")
+    void javaPlanBuilderReadTableFilterProjection() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+
+        new JavaPlanBuilder(wayangContext(), "BigQuery JavaPlanBuilder filter projection test")
+                .readTable(new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount"))
+                .filter(record -> "EMEA".equals(record.getField(1)))
+                .withSqlUdf("region = 'EMEA'")
+                .asRecords()
+                .projectRecords(new String[]{"order_id", "amount"})
+                .writeTable(SINK_TABLE, "overwrite", new String[]{"order_id", "amount"}, new Properties());
+
+        assertEquals(3, queryLong("SELECT count(*) FROM " + SINK_TABLE), "3 projected EMEA orders expected");
+        assertEquals(2, columnCount(SINK_TABLE), "projection keeps only 2 columns");
+    }
+
+    /** Fluent-API pipeline: EMEA filter followed by a global SUM; expects the single value 2320.5. */
+    @Test
+    @Order(10)
+    @DisplayName("BigQuery engine-only JavaPlanBuilder: readTable -> filter -> globalReduce -> tableSink")
+    void javaPlanBuilderReadTableFilterGlobalReduce() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+
+        new JavaPlanBuilder(wayangContext(), "BigQuery JavaPlanBuilder global reduce test")
+                .readTable(new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount"))
+                .filter(record ->
"EMEA".equals(record.getField(1)))
+                .withSqlUdf("region = 'EMEA'")
+                .reduce((left, right) -> left)
+                .withSqlUdf("SUM(amount) AS total_amount")
+                .writeTable(SINK_TABLE, "overwrite", new String[]{"total_amount"}, new Properties());
+
+        assertSingleDoubleResult(2320.5, "global reduction over EMEA should return one row");
+    }
+
+    /**
+     * Fluent-API pipeline: per-region SUM followed by a sort on region; checks that
+     * all three regions are present. Row order in the sink is not asserted.
+     */
+    @Test
+    @Order(11)
+    @DisplayName("BigQuery engine-only JavaPlanBuilder: readTable -> reduceByKey -> sort -> tableSink")
+    void javaPlanBuilderReadTableReduceBySort() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+
+        new JavaPlanBuilder(wayangContext(), "BigQuery JavaPlanBuilder reduce-by sort test")
+                .readTable(new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount"))
+                .reduceByKey(
+                        record -> new Record(record.getField(1)),
+                        (left, right) -> left)
+                .withSqlUdfs("region", "SUM(amount) AS total_amount")
+                .sort(record -> new Record(record.getField(0)))
+                .withSqlUdf("region", "ASC")
+                .writeTable(SINK_TABLE, "overwrite", new String[]{"region", "total_amount"}, new Properties());
+
+        Map sums = readRegionSums();
+        assertEquals(3, sums.size(), "one row per region expected");
+        assertTrue(sums.containsKey("AMER") && sums.containsKey("APAC") && sums.containsKey("EMEA"));
+    }
+
+    /**
+     * Builds the same pipeline as javaPlanBuilderReadTableFilterProjection and
+     * asserts only the number of rows written to the sink.
+     */
+    @Test
+    @Order(12)
+    @DisplayName("BigQuery engine-only JavaPlanBuilder: readTable -> filter -> projection -> writeTable")
+    void javaPlanBuilderReadTableFilterProjectionTableSink() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+
+        new JavaPlanBuilder(wayangContext(), "BigQuery JavaPlanBuilder table sink test")
+                .readTable(new BigQueryTableSource(TABLE, "order_id", "region", "product", "amount"))
+                .filter(record -> "EMEA".equals(record.getField(1)))
+                .withSqlUdf("region = 'EMEA'")
+                .asRecords()
+                .projectRecords(new String[]{"order_id", "amount"})
+                .writeTable(SINK_TABLE, "overwrite", new String[]{"order_id", "amount"}, new Properties());
+
+        assertEquals(3, queryLong("SELECT count(*) FROM " +
SINK_TABLE), "sink holds 3 projected EMEA orders");
+    }
+
+    /**
+     * Fluent-API join of orders and regions on region = region_name, flattened by
+     * the named map step that JoinFlattenMapping picks up, then written to the sink.
+     */
+    @Test
+    @Order(13)
+    @DisplayName("BigQuery engine-only JavaPlanBuilder: readTable + readTable -> join -> tableSink")
+    void javaPlanBuilderReadTableJoin() {
+        Assumptions.assumeTrue(available, "BigQuery not available");
+
+        JavaPlanBuilder plan = new JavaPlanBuilder(wayangContext(), "BigQuery JavaPlanBuilder join test");
+        // The element type must be spelled out: with a raw DataQuantaBuilder the key
+        // lambdas below would receive Object and record.getField(..) would not compile.
+        DataQuantaBuilder<?, Record> orders = plan.readTable(new BigQueryTableSource(
+                TABLE, "order_id", "region", "product", "amount"));
+        DataQuantaBuilder<?, Record> regions = plan.readTable(new BigQueryTableSource(
+                JOIN_TABLE, "region_name"));
+
+        orders
+                .join(
+                        record -> new Record(record.getField(1)),
+                        regions,
+                        record -> new Record(record.getField(0)))
+                .withSqlUdfs(TABLE, "region", JOIN_TABLE, "region_name")
+                .map(new JoinFlattenFunction())
+                .withName(JOIN_FLATTEN_NAME)
+                .writeTable(SINK_TABLE, "overwrite", JOIN_COLUMNS, new Properties());
+
+        assertEquals(10, queryLong("SELECT count(*) FROM " + SINK_TABLE),
+                "join yields one row per order");
+        assertEquals(0, queryLong("SELECT COUNTIF(region != region_name) FROM " + SINK_TABLE),
+                "joined regions should match");
+    }
+
+    // Helpers
+
+    /**
+     * Builds a fresh context that points the BigQuery platform at JDBC_URL,
+     * whitelists the test-only JoinFlattenMapping and registers the BigQuery plugin.
+     */
+    private WayangContext wayangContext() {
+        Configuration config = new Configuration();
+        config.setProperty("wayang.bigquery.jdbc.url", JDBC_URL);
+        config.getMappingProvider().addAllToWhitelist(
+                Collections.singleton(new JoinFlattenMapping()));
+        return new WayangContext(config)
+                .withPlugin(BigQuery.plugin());
+    }
+
+    /** Creates an overwrite-mode sink on SINK_TABLE with the given columns. */
+    private TableSink tableSink(String...
columnNames) {
+        return new TableSink<>(new Properties(), "overwrite", SINK_TABLE, columnNames);
+    }
+
+    /**
+     * Builds the map operator that turns a joined pair into one flat record. It is
+     * named JOIN_FLATTEN_NAME so that JoinFlattenMapping can recognise it.
+     */
+    private static MapOperator<Tuple2<Record, Record>, Record> joinFlattenOperator() {
+        MapOperator<Tuple2<Record, Record>, Record> operator = new MapOperator<>(
+                new TransformationDescriptor<>(
+                        new JoinFlattenFunction(),
+                        DataUnitType.createBasicUnchecked(Tuple2.class),
+                        DataUnitType.createBasic(Record.class)),
+                DataSetType.createDefaultUnchecked(Tuple2.class),
+                DataSetType.createDefault(Record.class));
+        operator.setName(JOIN_FLATTEN_NAME);
+        return operator;
+    }
+
+    /**
+     * Flattens a join result into a single record: a value that is already a Record
+     * is returned unchanged; otherwise the four fields of the left record are
+     * followed by field 0 of the right record.
+     */
+    private static Record flattenJoinResult(Object joinResult) {
+        if (joinResult instanceof Record) {
+            return (Record) joinResult;
+        }
+        Tuple2<?, ?> pair = (Tuple2<?, ?>) joinResult;
+        Record left = (Record) pair.field0;
+        Record right = (Record) pair.field1;
+        return new Record(
+                left.getField(0), left.getField(1), left.getField(2), left.getField(3),
+                right.getField(0));
+    }
+
+    /**
+     * Runs {@code sql} on a fresh connection and returns column 1 of the first row.
+     *
+     * @throws RuntimeException if the query fails or returns no row
+     */
+    private long queryLong(String sql) {
+        try (Connection c = DriverManager.getConnection(JDBC_URL);
+             Statement st = c.createStatement();
+             ResultSet rs = st.executeQuery(sql)) {
+            if (!rs.next()) {
+                throw new IllegalStateException("query returned no rows");
+            }
+            return rs.getLong(1);
+        } catch (Exception e) {
+            throw new RuntimeException("query failed: " + sql, e);
+        }
+    }
+
+    /**
+     * Runs {@code sql} on a fresh connection and returns column 1 of the first row.
+     *
+     * @throws RuntimeException if the query fails or returns no row
+     */
+    private double queryDouble(String sql) {
+        try (Connection c = DriverManager.getConnection(JDBC_URL);
+             Statement st = c.createStatement();
+             ResultSet rs = st.executeQuery(sql)) {
+            if (!rs.next()) {
+                throw new IllegalStateException("query returned no rows");
+            }
+            return rs.getDouble(1);
+        } catch (Exception e) {
+            throw new RuntimeException("query failed: " + sql, e);
+        }
+    }
+
+    /** Returns the number of columns of {@code table}, read from result-set metadata. */
+    private int columnCount(String table) {
+        try (Connection c = DriverManager.getConnection(JDBC_URL);
+             Statement st = c.createStatement();
+             ResultSet rs = st.executeQuery("SELECT * FROM " + table + " LIMIT 1")) {
+            return rs.getMetaData().getColumnCount();
+        } catch (Exception e) {
+            throw new RuntimeException("query failed: column count of " + table, e);
+        }
+    }
+
+    private void assertSingleDoubleResult(double expected, String message) {
+        try (Connection c = DriverManager.getConnection(JDBC_URL);
+             ResultSet rs =
c.createStatement().executeQuery("SELECT * FROM " + SINK_TABLE)) {
+            // Exactly one row is expected, and its first column must match.
+            assertTrue(rs.next(), message);
+            assertEquals(expected, rs.getDouble(1), 0.01, message);
+            assertFalse(rs.next(), message);
+        } catch (Exception e) {
+            throw new RuntimeException("query failed: SELECT * FROM " + SINK_TABLE, e);
+        }
+    }
+
+    /**
+     * Reads the sink table into a map from column 1 (region) to column 2 (summed
+     * amount).
+     *
+     * @throws RuntimeException if the query fails
+     */
+    private Map<String, Double> readRegionSums() {
+        Map<String, Double> sums = new HashMap<>();
+        try (Connection c = DriverManager.getConnection(JDBC_URL);
+             Statement st = c.createStatement();
+             ResultSet rs = st.executeQuery("SELECT * FROM " + SINK_TABLE)) {
+            while (rs.next()) {
+                sums.put(rs.getString(1), rs.getDouble(2));
+            }
+            return sums;
+        } catch (Exception e) {
+            throw new RuntimeException("query failed: SELECT * FROM " + SINK_TABLE, e);
+        }
+    }
+
+    /** Serializable wrapper around flattenJoinResult, used as the UDF of the flatten map step. */
+    private static final class JoinFlattenFunction implements
+            FunctionDescriptor.SerializableFunction<Tuple2<Record, Record>, Record> {
+        @Override
+        public Record apply(Tuple2<Record, Record> tuple) {
+            return flattenJoinResult(tuple);
+        }
+    }
+
+    /** Test-only mapping for the unresolved logical join Tuple-to-Record mismatch.
*/
+    @SuppressWarnings({"rawtypes", "unchecked"})
+    private static final class JoinFlattenMapping implements Mapping {
+        /**
+         * Returns a single transformation that replaces the map operator named
+         * JOIN_FLATTEN_NAME with a BigQuery projection operator.
+         */
+        @Override
+        public java.util.Collection<PlanTransformation> getTransformations() {
+            // Match on the operator name, not just on "any MapOperator".
+            OperatorPattern pattern = new OperatorPattern(
+                    "joinFlatten",
+                    new MapOperator(null, DataSetType.none(), DataSetType.createDefault(Record.class)),
+                    false)
+                    .withAdditionalTest(operator -> JOIN_FLATTEN_NAME.equals(((MapOperator) operator).getName()));
+
+            // The explicit type argument makes 'epoch' an Integer, which .at(epoch) needs.
+            ReplacementSubplanFactory factory = new ReplacementSubplanFactory.OfSingleOperators<MapOperator>(
+                    (matchedOperator, epoch) -> createBigQueryProjection().at(epoch));
+
+            return Collections.singleton(new PlanTransformation(
+                    SubplanPattern.createSingleton(pattern),
+                    factory,
+                    BigQueryPlatform.getInstance()));
+        }
+
+        /**
+         * Builds the BigQuery projection that selects JOIN_COLUMNS, wrapping a map
+         * operator that carries the flatten function and the flatten name.
+         */
+        private static BigQueryProjectionOperator createBigQueryProjection() {
+            ProjectionDescriptor<Tuple2<Record, Record>, Record> descriptor = new ProjectionDescriptor<>(
+                    new JoinFlattenFunction(),
+                    Arrays.asList(JOIN_COLUMNS),
+                    DataUnitType.createBasicUnchecked(Tuple2.class),
+                    DataUnitType.createBasic(Record.class));
+            MapOperator<Tuple2<Record, Record>, Record> projection = new MapOperator<>(
+                    descriptor,
+                    DataSetType.createDefaultUnchecked(Tuple2.class),
+                    DataSetType.createDefault(Record.class));
+            projection.setName(JOIN_FLATTEN_NAME);
+            // Cast through the raw type: the operator is handed over as Record -> Record.
+            return new BigQueryProjectionOperator((MapOperator<Record, Record>) (MapOperator) projection);
+        }
+    }
+}
diff --git a/wayang-platforms/wayang-jdbc-template/src/main/java/org/apache/wayang/jdbc/execution/JdbcExecutor.java b/wayang-platforms/wayang-jdbc-template/src/main/java/org/apache/wayang/jdbc/execution/JdbcExecutor.java index 401e331cd..6dd59a3a6 100644 --- a/wayang-platforms/wayang-jdbc-template/src/main/java/org/apache/wayang/jdbc/execution/JdbcExecutor.java +++ b/wayang-platforms/wayang-jdbc-template/src/main/java/org/apache/wayang/jdbc/execution/JdbcExecutor.java @@ -151,7 +151,9 @@ public static StringBuilder createSqlString(final JdbcExecutor jdbcExecutor, fin )); } - sb.append(';'); + // Intentionally no trailing ';'.
A trailing semicolon is unnecessary for a + // single-statement JDBC executeQuery and is rejected by strict SQL parsers + // such as Trino and BigQuery. Postgres/SQLite/HSQLDB accept its absence. return sb; } @@ -167,7 +169,7 @@ protected static Tuple2 createSqlQuery(final E final Collection startTasks = stage.getStartTasks(); // Verify that we can handle this instance. - final ExecutionTask startTask = (ExecutionTask) startTasks.toArray()[0]; + final ExecutionTask startTask = JdbcExecutor.selectStartTask(startTasks, stage); assert startTask.getOperator() instanceof TableSource : "Invalid JDBC stage: Start task has to be a TableSource"; @@ -192,13 +194,16 @@ protected static Tuple2 createSqlQuery(final E } else if (operator instanceof JdbcProjectionOperator) { assert projectionTask == null; // Allow one projection operator per stage for now. projectionTask = (JdbcProjectionOperator) operator; - } else if (operator instanceof final JdbcGlobalReduceOperator globalReduce) { + } else if (operator instanceof JdbcGlobalReduceOperator) { + final JdbcGlobalReduceOperator globalReduce = (JdbcGlobalReduceOperator) operator; assert globalReduceTask == null; // Allow one projection operator per stage for now. globalReduceTask = globalReduce; - } else if (operator instanceof final JdbcReduceByOperator reduceBy) { + } else if (operator instanceof JdbcReduceByOperator) { + final JdbcReduceByOperator reduceBy = (JdbcReduceByOperator) operator; assert reduceByTask == null; // Allow one projection operator per stage for now. reduceByTask = reduceBy; - } else if (operator instanceof final JdbcSortOperator sort) { + } else if (operator instanceof JdbcSortOperator) { + final JdbcSortOperator sort = (JdbcSortOperator) operator; assert sortTask == null; // Allow one projection operator per stage for now. 
sortTask = sort; } else if (operator instanceof JoinOperator || (operator instanceof SpatialJoinOperator)) { @@ -221,6 +226,33 @@ protected static Tuple2 createSqlQuery(final E return new Tuple2<>(query.toString(), tipChannelInstance); } + /** + * Selects the table source that belongs on the left-hand side of a JDBC join. + * Stage start tasks are not ordered, but {@link JdbcJoinOperator#createSqlClause} + * assumes its first key descriptor's table is used in the {@code FROM} clause. + */ + private static ExecutionTask selectStartTask(final Collection startTasks, final ExecutionStage stage) { + if (startTasks.size() == 1) { + return (ExecutionTask) startTasks.iterator().next(); + } + + for (ExecutionTask task : stage.getAllTasks()) { + if (task.getOperator() instanceof JdbcJoinOperator) { + final JdbcJoinOperator joinOperator = (JdbcJoinOperator) task.getOperator(); + final String leftTableName = joinOperator.getKeyDescriptor0().getSqlImplementation().field0; + for (Object startTaskObject : startTasks) { + final ExecutionTask startTask = (ExecutionTask) startTaskObject; + if (startTask.getOperator() instanceof JdbcTableSource + && ((JdbcTableSource) startTask.getOperator()).getTableName().equals(leftTableName)) { + return startTask; + } + } + } + } + + throw new WayangException("Could not determine the left table source for JDBC stage."); + } + /** * Handles execution stages that end with a {@link JdbcTableSinkOperator}. 
* Composes a SQL query from the stage's operators and executes it directly on @@ -235,8 +267,7 @@ private static void executeSinkStage(final ExecutionStage stage, final Optimizat final Collection startTasks = stage.getStartTasks(); final Collection termTasks = stage.getTerminalTasks(); - assert startTasks.size() == 1 : "Invalid JDBC stage: multiple sources are not currently supported"; - final ExecutionTask startTask = (ExecutionTask) startTasks.toArray()[0]; + final ExecutionTask startTask = JdbcExecutor.selectStartTask(startTasks, stage); assert termTasks.size() == 1 : "Invalid JDBC stage: multiple terminal tasks are not currently supported."; final ExecutionTask termTask = (ExecutionTask) termTasks.toArray()[0]; assert startTask.getOperator() instanceof TableSource @@ -249,17 +280,35 @@ private static void executeSinkStage(final ExecutionStage stage, final Optimizat final JdbcTableSinkOperator sinkOp = (JdbcTableSinkOperator) termTask.getOperator(); final Collection filterTasks = new ArrayList<>(4); JdbcProjectionOperator projectionTask = null; + JdbcGlobalReduceOperator globalReduceTask = null; + JdbcReduceByOperator reduceByTask = null; + JdbcSortOperator sortTask = null; final Collection joinTasks = new ArrayList<>(); // Walk through intermediate operators, stopping at the sink ExecutionTask nextTask = JdbcExecutor.findJdbcExecutionOperatorTaskInStage(startTask, stage); while (nextTask != null && !(nextTask.getOperator() instanceof JdbcTableSinkOperator)) { - if (nextTask.getOperator() instanceof final JdbcFilterOperator filterOperator) { + if (nextTask.getOperator() instanceof JdbcFilterOperator) { + final JdbcFilterOperator filterOperator = (JdbcFilterOperator) nextTask.getOperator(); filterTasks.add(filterOperator); - } else if (nextTask.getOperator() instanceof final JdbcProjectionOperator projectionOperator) { + } else if (nextTask.getOperator() instanceof JdbcProjectionOperator) { + final JdbcProjectionOperator projectionOperator = 
(JdbcProjectionOperator) nextTask.getOperator(); assert projectionTask == null; projectionTask = projectionOperator; - } else if (nextTask.getOperator() instanceof final JdbcJoinOperator joinOperator) { + } else if (nextTask.getOperator() instanceof JdbcGlobalReduceOperator) { + final JdbcGlobalReduceOperator globalReduceOperator = (JdbcGlobalReduceOperator) nextTask.getOperator(); + assert globalReduceTask == null; + globalReduceTask = globalReduceOperator; + } else if (nextTask.getOperator() instanceof JdbcReduceByOperator) { + final JdbcReduceByOperator reduceByOperator = (JdbcReduceByOperator) nextTask.getOperator(); + assert reduceByTask == null; + reduceByTask = reduceByOperator; + } else if (nextTask.getOperator() instanceof JdbcSortOperator) { + final JdbcSortOperator sortOperator = (JdbcSortOperator) nextTask.getOperator(); + assert sortTask == null; + sortTask = sortOperator; + } else if (nextTask.getOperator() instanceof JdbcJoinOperator) { + final JdbcJoinOperator joinOperator = (JdbcJoinOperator) nextTask.getOperator(); joinTasks.add(joinOperator); } else { throw new WayangException(String.format("Unsupported JDBC execution task %s", nextTask.toString())); @@ -268,8 +317,8 @@ private static void executeSinkStage(final ExecutionStage stage, final Optimizat } // Compose the SELECT query - final StringBuilder selectQuery = createSqlString(jdbcExecutor, tableOp, filterTasks, projectionTask, null, null, null, - joinTasks); + final StringBuilder selectQuery = createSqlString(jdbcExecutor, tableOp, filterTasks, projectionTask, + globalReduceTask, reduceByTask, sortTask, joinTasks); // Remove trailing semicolon from SELECT String selectSql = selectQuery.toString(); diff --git a/wayang-platforms/wayang-jdbc-template/src/main/java/org/apache/wayang/jdbc/operators/JdbcTableSource.java b/wayang-platforms/wayang-jdbc-template/src/main/java/org/apache/wayang/jdbc/operators/JdbcTableSource.java index 2d546deec..4d7096649 100644 --- 
a/wayang-platforms/wayang-jdbc-template/src/main/java/org/apache/wayang/jdbc/operators/JdbcTableSource.java +++ b/wayang-platforms/wayang-jdbc-template/src/main/java/org/apache/wayang/jdbc/operators/JdbcTableSource.java @@ -81,7 +81,8 @@ public CardinalityEstimate estimate(OptimizationContext optimizationContext, Car .createJdbcConnection()) { // Query the table cardinality. - final String sql = String.format("SELECT count(*) FROM %s;", JdbcTableSource.this.getTableName()); + // No trailing ';' — strict parsers (Trino, BigQuery) reject it in executeQuery. + final String sql = String.format("SELECT count(*) FROM %s", JdbcTableSource.this.getTableName()); final ResultSet resultSet = connection.createStatement().executeQuery(sql); if (!resultSet.next()) { throw new SQLException("No query result for \"" + sql + "\"."); diff --git a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/execution/JdbcExecutorTest.java b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/execution/JdbcExecutorTest.java index 0dfd8b698..8f7b3d8a2 100644 --- a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/execution/JdbcExecutorTest.java +++ b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/execution/JdbcExecutorTest.java @@ -81,7 +81,7 @@ void testExecuteWithPlainTableSource() throws SQLException { SqlQueryChannel.Instance sqlQueryChannelInstance = (SqlQueryChannel.Instance) job.getCrossPlatformExecutor().getChannelInstance(sqlToStreamTask.getInputChannel(0)); assertEquals( - "SELECT * FROM customer;", + "SELECT * FROM customer", sqlQueryChannelInstance.getSqlQuery() ); } @@ -130,7 +130,7 @@ void testExecuteWithFilter() throws SQLException { SqlQueryChannel.Instance sqlQueryChannelInstance = (SqlQueryChannel.Instance) job.getCrossPlatformExecutor().getChannelInstance(sqlToStreamTask.getInputChannel(0)); assertEquals( - "SELECT * FROM customer WHERE age >= 18;", + "SELECT * FROM customer 
WHERE age >= 18", sqlQueryChannelInstance.getSqlQuery() ); } @@ -172,7 +172,7 @@ void testExecuteWithProjection() throws SQLException { SqlQueryChannel.Instance sqlQueryChannelInstance = (SqlQueryChannel.Instance) job.getCrossPlatformExecutor().getChannelInstance(sqlToStreamTask.getInputChannel(0)); assertEquals( - "SELECT name, age FROM customer;", + "SELECT name, age FROM customer", sqlQueryChannelInstance.getSqlQuery() ); } @@ -240,7 +240,7 @@ void testExecuteWithProjectionAndFilters() throws SQLException { SqlQueryChannel.Instance sqlQueryChannelInstance = (SqlQueryChannel.Instance) job.getCrossPlatformExecutor().getChannelInstance(sqlToStreamTask.getInputChannel(0)); assertEquals( - "SELECT name, age FROM customer WHERE age >= 18 AND name IS NOT NULL;", + "SELECT name, age FROM customer WHERE age >= 18 AND name IS NOT NULL", sqlQueryChannelInstance.getSqlQuery() ); } diff --git a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcGlobalReduceOperatorTest.java b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcGlobalReduceOperatorTest.java index b5ccb0848..739e38896 100644 --- a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcGlobalReduceOperatorTest.java +++ b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcGlobalReduceOperatorTest.java @@ -62,7 +62,7 @@ void testWithHsqldb() throws SQLException { final ExecutionStage sqlStage = mock(ExecutionStage.class); final JdbcTableSource tableSourceA = new HsqldbTableSource("testA"); - + final ExecutionTask tableSourceATask = new ExecutionTask(tableSourceA); tableSourceATask.setOutputChannel(0, new SqlQueryChannel(sqlChannelDescriptor, tableSourceA.getOutput(0))); tableSourceATask.setStage(sqlStage); @@ -86,15 +86,12 @@ void testWithHsqldb() throws SQLException { globalReduceTask.getOutputChannel(0).addConsumer(sqlToStreamTask, 0); 
sqlToStreamTask.setStage(nextStage); - final HsqldbPlatform hsqldbPlatform = new HsqldbPlatform(); try (Connection jdbcConnection = hsqldbPlatform.createDatabaseDescriptor(configuration).createJdbcConnection()) { final Statement statement = jdbcConnection.createStatement(); statement.execute("CREATE TABLE IF NOT EXISTS testA (a INT, b VARCHAR(6));"); statement.execute("INSERT INTO testA VALUES (0, 'zero');"); - statement.execute("CREATE TABLE IF NOT EXISTS testB (a INT, b INT);"); - statement.execute("INSERT INTO testB VALUES (0, 100);"); } final JdbcExecutor executor = new JdbcExecutor(HsqldbPlatform.getInstance(), job); @@ -112,6 +109,6 @@ void testWithHsqldb() throws SQLException { assertTrue(count > 0); } - assertEquals("SELECT COUNT(*) FROM testA;", sqlQueryChannelInstance.getSqlQuery()); + assertEquals("SELECT COUNT(*) FROM testA", sqlQueryChannelInstance.getSqlQuery()); } } diff --git a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcJoinOperatorTest.java b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcJoinOperatorTest.java index 875a7a47b..d56405b19 100644 --- a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcJoinOperatorTest.java +++ b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcJoinOperatorTest.java @@ -39,7 +39,9 @@ import java.sql.Connection; import java.sql.SQLException; import java.sql.Statement; +import java.util.Arrays; import java.util.Collections; +import java.util.LinkedHashSet; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.Mockito.mock; @@ -116,7 +118,12 @@ void testWithHsqldb() throws SQLException { joinTask.setOutputChannel(0, new SqlQueryChannel(sqlChannelDescriptor, joinOperator.getOutput(0))); joinTask.setStage(sqlStage); - when(sqlStage.getStartTasks()).thenReturn(Collections.singleton(tableSourceATask)); + // Deliberately list the 
right source first: JdbcExecutor must still choose + // the join's left source for the FROM clause. + when(sqlStage.getStartTasks()).thenReturn(new LinkedHashSet<>(Arrays.asList( + tableSourceBTask, tableSourceATask))); + when(sqlStage.getAllTasks()).thenReturn(new LinkedHashSet<>(Arrays.asList( + tableSourceBTask, tableSourceATask, joinTask))); when(sqlStage.getTerminalTasks()).thenReturn(Collections.singleton(joinTask)); ExecutionStage nextStage = mock(ExecutionStage.class); @@ -135,7 +142,7 @@ void testWithHsqldb() throws SQLException { System.out.println(); assertEquals( - "SELECT * FROM testA JOIN testB ON testB.a=testA.a;", + "SELECT * FROM testA JOIN testB ON testB.a=testA.a", sqlQueryChannelInstance.getSqlQuery() ); } @@ -213,7 +220,7 @@ void testMultiConditionJoinWithHsqldb() throws SQLException { String generatedSql = sqlQueryChannelInstance.getSqlQuery(); assertEquals( - "SELECT * FROM orders JOIN shipments ON orders.order_id=shipments.order_id AND orders.customer_id=shipments.customer_id;", + "SELECT * FROM orders JOIN shipments ON orders.order_id=shipments.order_id AND orders.customer_id=shipments.customer_id", generatedSql ); diff --git a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcReduceByOperatorTest.java b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcReduceByOperatorTest.java index f00f4020e..556224027 100644 --- a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcReduceByOperatorTest.java +++ b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcReduceByOperatorTest.java @@ -91,6 +91,6 @@ void testWithHsqldb() throws SQLException { final SqlQueryChannel.Instance sqlQueryChannelInstance = (SqlQueryChannel.Instance) job.getCrossPlatformExecutor() .getChannelInstance(sqlToStreamTask.getInputChannel(0)); - assertEquals("SELECT col0,COUNT(*) FROM testA GROUP BY col0;", 
sqlQueryChannelInstance.getSqlQuery()); + assertEquals("SELECT col0,COUNT(*) FROM testA GROUP BY col0", sqlQueryChannelInstance.getSqlQuery()); } } diff --git a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcSortOperatorTest.java b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcSortOperatorTest.java index 118fb7efa..1dc2fe12f 100644 --- a/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcSortOperatorTest.java +++ b/wayang-platforms/wayang-jdbc-template/src/test/java/org/apache/wayang/jdbc/operators/JdbcSortOperatorTest.java @@ -86,6 +86,6 @@ void testWithHsqldb() throws SQLException { final SqlQueryChannel.Instance sqlQueryChannelInstance = (SqlQueryChannel.Instance) job.getCrossPlatformExecutor() .getChannelInstance(sqlToStreamTask.getInputChannel(0)); - assertEquals("SELECT * FROM testA ORDER BY col0 DESC;", sqlQueryChannelInstance.getSqlQuery()); + assertEquals("SELECT * FROM testA ORDER BY col0 DESC", sqlQueryChannelInstance.getSqlQuery()); } }